kernel/events/core.c

   1 /*
   2  * Performance events core code:
   3  *
   4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
   6  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8  *
   9  * For licensing details see kernel-base/COPYING
  10  */
  11
  12 #include <linux/fs.h>
  13 #include <linux/mm.h>
  14 #include <linux/cpu.h>
  15 #include <linux/smp.h>
  16 #include <linux/idr.h>
  17 #include <linux/file.h>
  18 #include <linux/poll.h>
  19 #include <linux/slab.h>
  20 #include <linux/hash.h>
  21 #include <linux/tick.h>
  22 #include <linux/sysfs.h>
  23 #include <linux/dcache.h>
  24 #include <linux/percpu.h>
  25 #include <linux/ptrace.h>
  26 #include <linux/reboot.h>
  27 #include <linux/vmstat.h>
  28 #include <linux/device.h>
  29 #include <linux/export.h>
  30 #include <linux/vmalloc.h>
  31 #include <linux/hardirq.h>
  32 #include <linux/rculist.h>
  33 #include <linux/uaccess.h>
  34 #include <linux/syscalls.h>
  35 #include <linux/anon_inodes.h>
  36 #include <linux/kernel_stat.h>
  37 #include <linux/cgroup.h>
  38 #include <linux/perf_event.h>
  39 #include <linux/ftrace_event.h>
  40 #include <linux/hw_breakpoint.h>
  41 #include <linux/mm_types.h>
  42 #include <linux/module.h>
  43 #include <linux/mman.h>
  44 #include <linux/compat.h>
  45 #include <linux/bpf.h>
  46 #include <linux/filter.h>
  47
  48 #include "internal.h"
  49
  50 #include <asm/irq_regs.h>
  51
/* Workqueue handle for perf; its creation and users are not visible in this part of the file. */
static struct workqueue_struct *perf_wq;
  53
/*
 * Argument bundle passed to remote_function() through
 * smp_call_function_single().
 */
struct remote_function_call {
        struct task_struct      *p;     /* target task; NULL for a plain per-cpu call */
        int                     (*func)(void *info);    /* callback to invoke */
        void                    *info;  /* opaque argument handed to @func */
        int                     ret;    /* result reported back to the caller */
};
  60
  61 static void remote_function(void *data)
  62 {
  63         struct remote_function_call *tfc = data;
  64         struct task_struct *p = tfc->p;
  65
  66         if (p) {
  67                 tfc->ret = -EAGAIN;
  68                 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
  69                         return;
  70         }
  71
  72         tfc->ret = tfc->func(tfc->info);
  73 }
  74
  75 /**
  76  * task_function_call - call a function on the cpu on which a task runs
  77  * @p:          the task to evaluate
  78  * @func:       the function to be called
  79  * @info:       the function call argument
  80  *
  81  * Calls the function @func when the task is currently running. This might
  82  * be on the current CPU, which just calls the function directly
  83  *
  84  * returns: @func return value, or
  85  *          -ESRCH  - when the process isn't running
  86  *          -EAGAIN - when the process moved away
  87  */
  88 static int
  89 task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
  90 {
  91         struct remote_function_call data = {
  92                 .p      = p,
  93                 .func   = func,
  94                 .info   = info,
  95                 .ret    = -ESRCH, /* No such (running) process */
  96         };
  97
  98         if (task_curr(p))
  99                 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
 100
 101         return data.ret;
 102 }
 103
 104 /**
 105  * cpu_function_call - call a function on the cpu
 106  * @func:       the function to be called
 107  * @info:       the function call argument
 108  *
 109  * Calls the function @func on the remote cpu.
 110  *
 111  * returns: @func return value or -ENXIO when the cpu is offline
 112  */
 113 static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
 114 {
 115         struct remote_function_call data = {
 116                 .p      = NULL,
 117                 .func   = func,
 118                 .info   = info,
 119                 .ret    = -ENXIO, /* No such CPU */
 120         };
 121
 122         smp_call_function_single(cpu, remote_function, &data, 1);
 123
 124         return data.ret;
 125 }
 126
/* Sentinel value of perf_event::owner that is_kernel_event() tests for. */
#define EVENT_OWNER_KERNEL ((void *) -1)
 128
 129 static bool is_kernel_event(struct perf_event *event)
 130 {
 131         return event->owner == EVENT_OWNER_KERNEL;
 132 }
 133
/* Bitwise OR of the four PERF_FLAG_* bits listed below. */
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP |\
                       PERF_FLAG_FD_CLOEXEC)
 138
/*
 * Branch sampling privilege levels that need permission checks:
 * the kernel and hypervisor (HV) levels.
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
        (PERF_SAMPLE_BRANCH_KERNEL |\
         PERF_SAMPLE_BRANCH_HV)
 145
/*
 * Bitmask selecting which class of events a scheduling operation applies
 * to; passed to cpu_ctx_sched_out() and cpu_ctx_sched_in().
 */
enum event_type_t {
        EVENT_FLEXIBLE = 0x1,
        EVENT_PINNED = 0x2,
        /* both classes */
        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
 151
/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
/* NOTE(review): presumably a per-cpu count of sched callback users - confirm at the use sites */
static DEFINE_PER_CPU(int, perf_sched_cb_usages);

/* NOTE(review): presumably counts of events requesting mmap/comm/task/freq records - confirm */
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;

/*
 * List of PMUs; the walkers in this part of the file iterate it with
 * list_for_each_entry_rcu() under rcu_read_lock().
 * NOTE(review): pmus_lock and pmus_srcu presumably serialize updates - confirm.
 */
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
 168
/*
 * perf event paranoia level (default: 1):
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

/* Minimum for 512 kiB + 1 user control page (PAGE_SIZE expressed in kiB) */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
 180
/*
 * max perf event sample rate
 *
 * Defaults: the maximum sample rate, the sample period derived from it in
 * nanoseconds, and the share of that period (in percent) that sample
 * handling may consume.
 */
#define DEFAULT_MAX_SAMPLE_RATE         100000
#define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT    25

int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;

/* Derived from the sample rate; recomputed whenever the rate changes. */
static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;

/* Allowed average sample length in ns; kept current by update_perf_cpu_limits(). */
static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 195
 196 void update_perf_cpu_limits(void)
 197 {
 198         u64 tmp = perf_sample_period_ns;
 199
 200         tmp *= sysctl_perf_cpu_time_max_percent;
 201         do_div(tmp, 100);
 202         ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 203 }
 204
/* Forward declaration; called from perf_cpu_hrtimer_handler(). */
static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 206
 207 int perf_proc_update_handler(struct ctl_table *table, int write,
 208                 void __user *buffer, size_t *lenp,
 209                 loff_t *ppos)
 210 {
 211         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 212
 213         if (ret || !write)
 214                 return ret;
 215
 216         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 217         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 218         update_perf_cpu_limits();
 219
 220         return 0;
 221 }
 222
/* Percentage of the sample period that update_perf_cpu_limits() turns into the allowed sample length. */
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 224
 225 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 226                                 void __user *buffer, size_t *lenp,
 227                                 loff_t *ppos)
 228 {
 229         int ret = proc_dointvec(table, write, buffer, lenp, ppos);
 230
 231         if (ret || !write)
 232                 return ret;
 233
 234         update_perf_cpu_limits();
 235
 236         return 0;
 237 }
 238
/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
/* Divisor that turns the running per-cpu sum below into an average. */
#define NR_ACCUMULATED_SAMPLES 128
/* Per-cpu decaying sum of sample lengths, maintained by perf_sample_event_took(). */
static DEFINE_PER_CPU(u64, running_sample_length);
 247
 248 static void perf_duration_warn(struct irq_work *w)
 249 {
 250         u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 251         u64 avg_local_sample_len;
 252         u64 local_samples_len;
 253
 254         local_samples_len = __this_cpu_read(running_sample_length);
 255         avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 256
 257         printk_ratelimited(KERN_WARNING
 258                         "perf interrupt took too long (%lld > %lld), lowering "
 259                         "kernel.perf_event_max_sample_rate to %d\n",
 260                         avg_local_sample_len, allowed_ns >> 1,
 261                         sysctl_perf_event_sample_rate);
 262 }
 263
/* irq_work that runs perf_duration_warn(); queued by perf_sample_event_took(). */
static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 265
 266 void perf_sample_event_took(u64 sample_len_ns)
 267 {
 268         u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 269         u64 avg_local_sample_len;
 270         u64 local_samples_len;
 271
 272         if (allowed_ns == 0)
 273                 return;
 274
 275         /* decay the counter by 1 average sample */
 276         local_samples_len = __this_cpu_read(running_sample_length);
 277         local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
 278         local_samples_len += sample_len_ns;
 279         __this_cpu_write(running_sample_length, local_samples_len);
 280
 281         /*
 282          * note: this will be biased artifically low until we have
 283          * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
 284          * from having to maintain a count.
 285          */
 286         avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 287
 288         if (avg_local_sample_len <= allowed_ns)
 289                 return;
 290
 291         if (max_samples_per_tick <= 1)
 292                 return;
 293
 294         max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
 295         sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
 296         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 297
 298         update_perf_cpu_limits();
 299
 300         if (!irq_work_queue(&perf_duration_work)) {
 301                 early_printk("perf interrupt took too long (%lld > %lld), lowering "
 302                              "kernel.perf_event_max_sample_rate to %d\n",
 303                              avg_local_sample_len, allowed_ns >> 1,
 304                              sysctl_perf_event_sample_rate);
 305         }
 306 }
 307
/* NOTE(review): presumably the source of unique event ids - confirm at the use sites */
static atomic64_t perf_event_id;
 309
/*
 * Forward declarations for helpers that are used before their definitions
 * (for example by the cgroup code below).
 */
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
                              enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
                             enum event_type_t event_type,
                             struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
 319
 320 void __weak perf_event_print_debug(void)        { }
 321
 322 extern __weak const char *perf_pmu_name(void)
 323 {
 324         return "pmu";
 325 }
 326
 327 static inline u64 perf_clock(void)
 328 {
 329         return local_clock();
 330 }
 331
 332 static inline u64 perf_event_clock(struct perf_event *event)
 333 {
 334         return event->clock();
 335 }
 336
 337 static inline struct perf_cpu_context *
 338 __get_cpu_context(struct perf_event_context *ctx)
 339 {
 340         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
 341 }
 342
 343 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 344                           struct perf_event_context *ctx)
 345 {
 346         raw_spin_lock(&cpuctx->ctx.lock);
 347         if (ctx)
 348                 raw_spin_lock(&ctx->lock);
 349 }
 350
 351 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 352                             struct perf_event_context *ctx)
 353 {
 354         if (ctx)
 355                 raw_spin_unlock(&ctx->lock);
 356         raw_spin_unlock(&cpuctx->ctx.lock);
 357 }
 358
 359 #ifdef CONFIG_CGROUP_PERF
 360
 361 static inline bool
 362 perf_cgroup_match(struct perf_event *event)
 363 {
 364         struct perf_event_context *ctx = event->ctx;
 365         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 366
 367         /* @event doesn't care about cgroup */
 368         if (!event->cgrp)
 369                 return true;
 370
 371         /* wants specific cgroup scope but @cpuctx isn't associated with any */
 372         if (!cpuctx->cgrp)
 373                 return false;
 374
 375         /*
 376          * Cgroup scoping is recursive.  An event enabled for a cgroup is
 377          * also enabled for all its descendant cgroups.  If @cpuctx's
 378          * cgroup is a descendant of @event's (the test covers identity
 379          * case), it's a match.
 380          */
 381         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
 382                                     event->cgrp->css.cgroup);
 383 }
 384
 385 static inline void perf_detach_cgroup(struct perf_event *event)
 386 {
 387         css_put(&event->cgrp->css);
 388         event->cgrp = NULL;
 389 }
 390
 391 static inline int is_cgroup_event(struct perf_event *event)
 392 {
 393         return event->cgrp != NULL;
 394 }
 395
 396 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 397 {
 398         struct perf_cgroup_info *t;
 399
 400         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 401         return t->time;
 402 }
 403
 404 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 405 {
 406         struct perf_cgroup_info *info;
 407         u64 now;
 408
 409         now = perf_clock();
 410
 411         info = this_cpu_ptr(cgrp->info);
 412
 413         info->time += now - info->timestamp;
 414         info->timestamp = now;
 415 }
 416
 417 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 418 {
 419         struct perf_cgroup *cgrp_out = cpuctx->cgrp;
 420         if (cgrp_out)
 421                 __update_cgrp_time(cgrp_out);
 422 }
 423
 424 static inline void update_cgrp_time_from_event(struct perf_event *event)
 425 {
 426         struct perf_cgroup *cgrp;
 427
 428         /*
 429          * ensure we access cgroup data only when needed and
 430          * when we know the cgroup is pinned (css_get)
 431          */
 432         if (!is_cgroup_event(event))
 433                 return;
 434
 435         cgrp = perf_cgroup_from_task(current);
 436         /*
 437          * Do not update time when cgroup is not active
 438          */
 439         if (cgrp == event->cgrp)
 440                 __update_cgrp_time(event->cgrp);
 441 }
 442
 443 static inline void
 444 perf_cgroup_set_timestamp(struct task_struct *task,
 445                           struct perf_event_context *ctx)
 446 {
 447         struct perf_cgroup *cgrp;
 448         struct perf_cgroup_info *info;
 449
 450         /*
 451          * ctx->lock held by caller
 452          * ensure we do not access cgroup data
 453          * unless we have the cgroup pinned (css_get)
 454          */
 455         if (!task || !ctx->nr_cgroups)
 456                 return;
 457
 458         cgrp = perf_cgroup_from_task(task);
 459         info = this_cpu_ptr(cgrp->info);
 460         info->timestamp = ctx->timestamp;
 461 }
 462
/* Mode bits for perf_cgroup_switch(); the function tests each bit independently. */
#define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
 465
 466 /*
 467  * reschedule events based on the cgroup constraint of task.
 468  *
 469  * mode SWOUT : schedule out everything
 470  * mode SWIN : schedule in based on cgroup for next
 471  */
 472 void perf_cgroup_switch(struct task_struct *task, int mode)
 473 {
 474         struct perf_cpu_context *cpuctx;
 475         struct pmu *pmu;
 476         unsigned long flags;
 477
 478         /*
 479          * disable interrupts to avoid geting nr_cgroup
 480          * changes via __perf_event_disable(). Also
 481          * avoids preemption.
 482          */
 483         local_irq_save(flags);
 484
 485         /*
 486          * we reschedule only in the presence of cgroup
 487          * constrained events.
 488          */
 489         rcu_read_lock();
 490
 491         list_for_each_entry_rcu(pmu, &pmus, entry) {
 492                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 493                 if (cpuctx->unique_pmu != pmu)
 494                         continue; /* ensure we process each cpuctx once */
 495
 496                 /*
 497                  * perf_cgroup_events says at least one
 498                  * context on this CPU has cgroup events.
 499                  *
 500                  * ctx->nr_cgroups reports the number of cgroup
 501                  * events for a context.
 502                  */
 503                 if (cpuctx->ctx.nr_cgroups > 0) {
 504                         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 505                         perf_pmu_disable(cpuctx->ctx.pmu);
 506
 507                         if (mode & PERF_CGROUP_SWOUT) {
 508                                 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 509                                 /*
 510                                  * must not be done before ctxswout due
 511                                  * to event_filter_match() in event_sched_out()
 512                                  */
 513                                 cpuctx->cgrp = NULL;
 514                         }
 515
 516                         if (mode & PERF_CGROUP_SWIN) {
 517                                 WARN_ON_ONCE(cpuctx->cgrp);
 518                                 /*
 519                                  * set cgrp before ctxsw in to allow
 520                                  * event_filter_match() to not have to pass
 521                                  * task around
 522                                  */
 523                                 cpuctx->cgrp = perf_cgroup_from_task(task);
 524                                 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 525                         }
 526                         perf_pmu_enable(cpuctx->ctx.pmu);
 527                         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 528                 }
 529         }
 530
 531         rcu_read_unlock();
 532
 533         local_irq_restore(flags);
 534 }
 535
 536 static inline void perf_cgroup_sched_out(struct task_struct *task,
 537                                          struct task_struct *next)
 538 {
 539         struct perf_cgroup *cgrp1;
 540         struct perf_cgroup *cgrp2 = NULL;
 541
 542         /*
 543          * we come here when we know perf_cgroup_events > 0
 544          */
 545         cgrp1 = perf_cgroup_from_task(task);
 546
 547         /*
 548          * next is NULL when called from perf_event_enable_on_exec()
 549          * that will systematically cause a cgroup_switch()
 550          */
 551         if (next)
 552                 cgrp2 = perf_cgroup_from_task(next);
 553
 554         /*
 555          * only schedule out current cgroup events if we know
 556          * that we are switching to a different cgroup. Otherwise,
 557          * do no touch the cgroup events.
 558          */
 559         if (cgrp1 != cgrp2)
 560                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 561 }
 562
 563 static inline void perf_cgroup_sched_in(struct task_struct *prev,
 564                                         struct task_struct *task)
 565 {
 566         struct perf_cgroup *cgrp1;
 567         struct perf_cgroup *cgrp2 = NULL;
 568
 569         /*
 570          * we come here when we know perf_cgroup_events > 0
 571          */
 572         cgrp1 = perf_cgroup_from_task(task);
 573
 574         /* prev can never be NULL */
 575         cgrp2 = perf_cgroup_from_task(prev);
 576
 577         /*
 578          * only need to schedule in cgroup events if we are changing
 579          * cgroup during ctxsw. Cgroup events were not scheduled
 580          * out of ctxsw out if that was not the case.
 581          */
 582         if (cgrp1 != cgrp2)
 583                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 584 }
 585
 586 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 587                                       struct perf_event_attr *attr,
 588                                       struct perf_event *group_leader)
 589 {
 590         struct perf_cgroup *cgrp;
 591         struct cgroup_subsys_state *css;
 592         struct fd f = fdget(fd);
 593         int ret = 0;
 594
 595         if (!f.file)
 596                 return -EBADF;
 597
 598         css = css_tryget_online_from_dir(f.file->f_path.dentry,
 599                                          &perf_event_cgrp_subsys);
 600         if (IS_ERR(css)) {
 601                 ret = PTR_ERR(css);
 602                 goto out;
 603         }
 604
 605         cgrp = container_of(css, struct perf_cgroup, css);
 606         event->cgrp = cgrp;
 607
 608         /*
 609          * all events in a group must monitor
 610          * the same cgroup because a task belongs
 611          * to only one perf cgroup at a time
 612          */
 613         if (group_leader && group_leader->cgrp != cgrp) {
 614                 perf_detach_cgroup(event);
 615                 ret = -EINVAL;
 616         }
 617 out:
 618         fdput(f);
 619         return ret;
 620 }
 621
 622 static inline void
 623 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 624 {
 625         struct perf_cgroup_info *t;
 626         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 627         event->shadow_ctx_time = now - t->timestamp;
 628 }
 629
 630 static inline void
 631 perf_cgroup_defer_enabled(struct perf_event *event)
 632 {
 633         /*
 634          * when the current task's perf cgroup does not match
 635          * the event's, we need to remember to call the
 636          * perf_mark_enable() function the first time a task with
 637          * a matching perf cgroup is scheduled in.
 638          */
 639         if (is_cgroup_event(event) && !perf_cgroup_match(event))
 640                 event->cgrp_defer_enabled = 1;
 641 }
 642
 643 static inline void
 644 perf_cgroup_mark_enabled(struct perf_event *event,
 645                          struct perf_event_context *ctx)
 646 {
 647         struct perf_event *sub;
 648         u64 tstamp = perf_event_time(event);
 649
 650         if (!event->cgrp_defer_enabled)
 651                 return;
 652
 653         event->cgrp_defer_enabled = 0;
 654
 655         event->tstamp_enabled = tstamp - event->total_time_enabled;
 656         list_for_each_entry(sub, &event->sibling_list, group_entry) {
 657                 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
 658                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
 659                         sub->cgrp_defer_enabled = 0;
 660                 }
 661         }
 662 }
 663 #else /* !CONFIG_CGROUP_PERF */
 664
 665 static inline bool
 666 perf_cgroup_match(struct perf_event *event)
 667 {
 668         return true;
 669 }
 670
 671 static inline void perf_detach_cgroup(struct perf_event *event)
 672 {}
 673
 674 static inline int is_cgroup_event(struct perf_event *event)
 675 {
 676         return 0;
 677 }
 678
 679 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
 680 {
 681         return 0;
 682 }
 683
 684 static inline void update_cgrp_time_from_event(struct perf_event *event)
 685 {
 686 }
 687
 688 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 689 {
 690 }
 691
 692 static inline void perf_cgroup_sched_out(struct task_struct *task,
 693                                          struct task_struct *next)
 694 {
 695 }
 696
 697 static inline void perf_cgroup_sched_in(struct task_struct *prev,
 698                                         struct task_struct *task)
 699 {
 700 }
 701
 702 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 703                                       struct perf_event_attr *attr,
 704                                       struct perf_event *group_leader)
 705 {
 706         return -EINVAL;
 707 }
 708
 709 static inline void
 710 perf_cgroup_set_timestamp(struct task_struct *task,
 711                           struct perf_event_context *ctx)
 712 {
 713 }
 714
 715 void
 716 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
 717 {
 718 }
 719
 720 static inline void
 721 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 722 {
 723 }
 724
 725 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 726 {
 727         return 0;
 728 }
 729
 730 static inline void
 731 perf_cgroup_defer_enabled(struct perf_event *event)
 732 {
 733 }
 734
 735 static inline void
 736 perf_cgroup_mark_enabled(struct perf_event *event,
 737                          struct perf_event_context *ctx)
 738 {
 739 }
 740 #endif
 741
/*
 * set default to be dependent on timer tick just
 * like original code
 *
 * The value is the length of one tick in milliseconds; it is used as the
 * fallback for pmu->hrtimer_interval_ms in __perf_cpu_hrtimer_init().
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
 747 /*
 748  * function must be called with interrupts disbled
 749  */
 750 static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
 751 {
 752         struct perf_cpu_context *cpuctx;
 753         enum hrtimer_restart ret = HRTIMER_NORESTART;
 754         int rotations = 0;
 755
 756         WARN_ON(!irqs_disabled());
 757
 758         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
 759
 760         rotations = perf_rotate_context(cpuctx);
 761
 762         /*
 763          * arm timer if needed
 764          */
 765         if (rotations) {
 766                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
 767                 ret = HRTIMER_RESTART;
 768         }
 769
 770         return ret;
 771 }
 772
 773 /* CPU is going down */
 774 void perf_cpu_hrtimer_cancel(int cpu)
 775 {
 776         struct perf_cpu_context *cpuctx;
 777         struct pmu *pmu;
 778         unsigned long flags;
 779
 780         if (WARN_ON(cpu != smp_processor_id()))
 781                 return;
 782
 783         local_irq_save(flags);
 784
 785         rcu_read_lock();
 786
 787         list_for_each_entry_rcu(pmu, &pmus, entry) {
 788                 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 789
 790                 if (pmu->task_ctx_nr == perf_sw_context)
 791                         continue;
 792
 793                 hrtimer_cancel(&cpuctx->hrtimer);
 794         }
 795
 796         rcu_read_unlock();
 797
 798         local_irq_restore(flags);
 799 }
 800
 801 static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 802 {
 803         struct hrtimer *hr = &cpuctx->hrtimer;
 804         struct pmu *pmu = cpuctx->ctx.pmu;
 805         int timer;
 806
 807         /* no multiplexing needed for SW PMU */
 808         if (pmu->task_ctx_nr == perf_sw_context)
 809                 return;
 810
 811         /*
 812          * check default is sane, if not set then force to
 813          * default interval (1/tick)
 814          */
 815         timer = pmu->hrtimer_interval_ms;
 816         if (timer < 1)
 817                 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
 818
 819         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 820
 821         hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 822         hr->function = perf_cpu_hrtimer_handler;
 823 }
 824
 825 static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
 826 {
 827         struct hrtimer *hr = &cpuctx->hrtimer;
 828         struct pmu *pmu = cpuctx->ctx.pmu;
 829
 830         /* not for SW PMU */
 831         if (pmu->task_ctx_nr == perf_sw_context)
 832                 return;
 833
 834         if (hrtimer_active(hr))
 835                 return;
 836
 837         hrtimer_start(hr, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED);
 838 }
 839
 840 void perf_pmu_disable(struct pmu *pmu)
 841 {
 842         int *count = this_cpu_ptr(pmu->pmu_disable_count);
 843         if (!(*count)++)
 844                 pmu->pmu_disable(pmu);
 845 }
 846
 847 void perf_pmu_enable(struct pmu *pmu)
 848 {
 849         int *count = this_cpu_ptr(pmu->pmu_disable_count);
 850         if (!--(*count))
 851                 pmu->pmu_enable(pmu);
 852 }
 853
/* Per-cpu list head that perf_event_ctx_activate() links contexts onto. */
static DEFINE_PER_CPU(struct list_head, active_ctx_list);
 855
 856 /*
 857  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 858  * perf_event_task_tick() are fully serialized because they're strictly cpu
 859  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 860  * disabled, while perf_event_task_tick is called from IRQ context.
 861  */
 862 static void perf_event_ctx_activate(struct perf_event_context *ctx)
 863 {
 864         struct list_head *head = this_cpu_ptr(&active_ctx_list);
 865
 866         WARN_ON(!irqs_disabled());
 867
 868         WARN_ON(!list_empty(&ctx->active_ctx_list));
 869
 870         list_add(&ctx->active_ctx_list, head);
 871 }
 872
 873 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
 874 {
 875         WARN_ON(!irqs_disabled());
 876
 877         WARN_ON(list_empty(&ctx->active_ctx_list));
 878
 879         list_del_init(&ctx->active_ctx_list);
 880 }
 881
 882 static void get_ctx(struct perf_event_context *ctx)
 883 {
 884         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 885 }
 886
 887 static void free_ctx(struct rcu_head *head)
 888 {
 889         struct perf_event_context *ctx;
 890
 891         ctx = container_of(head, struct perf_event_context, rcu_head);
 892         kfree(ctx->task_ctx_data);
 893         kfree(ctx);
 894 }
 895
 896 static void put_ctx(struct perf_event_context *ctx)
 897 {
 898         if (atomic_dec_and_test(&ctx->refcount)) {
 899                 if (ctx->parent_ctx)
 900                         put_ctx(ctx->parent_ctx);
 901                 if (ctx->task)
 902                         put_task_struct(ctx->task);
 903                 call_rcu(&ctx->rcu_head, free_ctx);
 904         }
 905 }
 906
 907 /*
 908  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 909  * perf_pmu_migrate_context() we need some magic.
 910  *
 911  * Those places that change perf_event::ctx will hold both
 912  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 913  *
 914  * Lock ordering is by mutex address. There is one other site where
 915  * perf_event_context::mutex nests and that is put_event(). But remember that
 916  * that is a parent<->child context relation, and migration does not affect
 917  * children, therefore these two orderings should not interact.
 918  *
 919  * The change in perf_event::ctx does not affect children (as claimed above)
 920  * because the sys_perf_event_open() case will install a new event and break
 921  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 922  * concerned with cpuctx and that doesn't have children.
 923  *
 924  * The places that change perf_event::ctx will issue:
 925  *
 926  *   perf_remove_from_context();
 927  *   synchronize_rcu();
 928  *   perf_install_in_context();
 929  *
 930  * to affect the change. The remove_from_context() + synchronize_rcu() should
 931  * quiesce the event, after which we can install it in the new location. This
 932  * means that only external vectors (perf_fops, prctl) can perturb the event
 933  * while in transit. Therefore all such accessors should also acquire
 934  * perf_event_context::mutex to serialize against this.
 935  *
 936  * However; because event->ctx can change while we're waiting to acquire
 937  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 938  * function.
 939  *
 940  * Lock order:
 941  *      task_struct::perf_event_mutex
 942  *        perf_event_context::mutex
 943  *          perf_event_context::lock
 944  *          perf_event::child_mutex;
 945  *          perf_event::mmap_mutex
 946  *          mmap_sem
 947  */
 948 static struct perf_event_context *
 949 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
 950 {
 951         struct perf_event_context *ctx;
 952
 953 again:
 954         rcu_read_lock();
 955         ctx = ACCESS_ONCE(event->ctx);
 956         if (!atomic_inc_not_zero(&ctx->refcount)) {
 957                 rcu_read_unlock();
 958                 goto again;
 959         }
 960         rcu_read_unlock();
 961
 962         mutex_lock_nested(&ctx->mutex, nesting);
 963         if (event->ctx != ctx) {
 964                 mutex_unlock(&ctx->mutex);
 965                 put_ctx(ctx);
 966                 goto again;
 967         }
 968
 969         return ctx;
 970 }
 971
 972 static inline struct perf_event_context *
 973 perf_event_ctx_lock(struct perf_event *event)
 974 {
 975         return perf_event_ctx_lock_nested(event, 0);
 976 }
 977
 978 static void perf_event_ctx_unlock(struct perf_event *event,
 979                                   struct perf_event_context *ctx)
 980 {
 981         mutex_unlock(&ctx->mutex);
 982         put_ctx(ctx);
 983 }
 984
 985 /*
 986  * This must be done under the ctx->lock, such as to serialize against
 987  * context_equiv(), therefore we cannot call put_ctx() since that might end up
 988  * calling scheduler related locks and ctx->lock nests inside those.
 989  */
 990 static __must_check struct perf_event_context *
 991 unclone_ctx(struct perf_event_context *ctx)
 992 {
 993         struct perf_event_context *parent_ctx = ctx->parent_ctx;
 994
 995         lockdep_assert_held(&ctx->lock);
 996
 997         if (parent_ctx)
 998                 ctx->parent_ctx = NULL;
 999         ctx->generation++;
1000
1001         return parent_ctx;
1002 }
1003
1004 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1005 {
1006         /*
1007          * only top level events have the pid namespace they were created in
1008          */
1009         if (event->parent)
1010                 event = event->parent;
1011
1012         return task_tgid_nr_ns(p, event->ns);
1013 }
1014
1015 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1016 {
1017         /*
1018          * only top level events have the pid namespace they were created in
1019          */
1020         if (event->parent)
1021                 event = event->parent;
1022
1023         return task_pid_nr_ns(p, event->ns);
1024 }
1025
1026 /*
1027  * If we inherit events we want to return the parent event id
1028  * to userspace.
1029  */
1030 static u64 primary_event_id(struct perf_event *event)
1031 {
1032         u64 id = event->id;
1033
1034         if (event->parent)
1035                 id = event->parent->id;
1036
1037         return id;
1038 }
1039
1040 /*
1041  * Get the perf_event_context for a task and lock it.
1042  * This has to cope with with the fact that until it is locked,
1043  * the context could get moved to another task.
1044  */
1045 static struct perf_event_context *
1046 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1047 {
1048         struct perf_event_context *ctx;
1049
1050 retry:
1051         /*
1052          * One of the few rules of preemptible RCU is that one cannot do
1053          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1054          * part of the read side critical section was preemptible -- see
1055          * rcu_read_unlock_special().
1056          *
1057          * Since ctx->lock nests under rq->lock we must ensure the entire read
1058          * side critical section is non-preemptible.
1059          */
1060         preempt_disable();
1061         rcu_read_lock();
1062         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1063         if (ctx) {
1064                 /*
1065                  * If this context is a clone of another, it might
1066                  * get swapped for another underneath us by
1067                  * perf_event_task_sched_out, though the
1068                  * rcu_read_lock() protects us from any context
1069                  * getting freed.  Lock the context and check if it
1070                  * got swapped before we could get the lock, and retry
1071                  * if so.  If we locked the right context, then it
1072                  * can't get swapped on us any more.
1073                  */
1074                 raw_spin_lock_irqsave(&ctx->lock, *flags);
1075                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1076                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1077                         rcu_read_unlock();
1078                         preempt_enable();
1079                         goto retry;
1080                 }
1081
1082                 if (!atomic_inc_not_zero(&ctx->refcount)) {
1083                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1084                         ctx = NULL;
1085                 }
1086         }
1087         rcu_read_unlock();
1088         preempt_enable();
1089         return ctx;
1090 }
1091
1092 /*
1093  * Get the context for a task and increment its pin_count so it
1094  * can't get swapped to another task.  This also increments its
1095  * reference count so that the context can't get freed.
1096  */
1097 static struct perf_event_context *
1098 perf_pin_task_context(struct task_struct *task, int ctxn)
1099 {
1100         struct perf_event_context *ctx;
1101         unsigned long flags;
1102
1103         ctx = perf_lock_task_context(task, ctxn, &flags);
1104         if (ctx) {
1105                 ++ctx->pin_count;
1106                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1107         }
1108         return ctx;
1109 }
1110
1111 static void perf_unpin_context(struct perf_event_context *ctx)
1112 {
1113         unsigned long flags;
1114
1115         raw_spin_lock_irqsave(&ctx->lock, flags);
1116         --ctx->pin_count;
1117         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1118 }
1119
1120 /*
1121  * Update the record of the current time in a context.
1122  */
1123 static void update_context_time(struct perf_event_context *ctx)
1124 {
1125         u64 now = perf_clock();
1126
1127         ctx->time += now - ctx->timestamp;
1128         ctx->timestamp = now;
1129 }
1130
1131 static u64 perf_event_time(struct perf_event *event)
1132 {
1133         struct perf_event_context *ctx = event->ctx;
1134
1135         if (is_cgroup_event(event))
1136                 return perf_cgroup_event_time(event);
1137
1138         return ctx ? ctx->time : 0;
1139 }
1140
1141 /*
1142  * Update the total_time_enabled and total_time_running fields for a event.
1143  * The caller of this function needs to hold the ctx->lock.
1144  */
1145 static void update_event_times(struct perf_event *event)
1146 {
1147         struct perf_event_context *ctx = event->ctx;
1148         u64 run_end;
1149
1150         if (event->state < PERF_EVENT_STATE_INACTIVE ||
1151             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1152                 return;
1153         /*
1154          * in cgroup mode, time_enabled represents
1155          * the time the event was enabled AND active
1156          * tasks were in the monitored cgroup. This is
1157          * independent of the activity of the context as
1158          * there may be a mix of cgroup and non-cgroup events.
1159          *
1160          * That is why we treat cgroup events differently
1161          * here.
1162          */
1163         if (is_cgroup_event(event))
1164                 run_end = perf_cgroup_event_time(event);
1165         else if (ctx->is_active)
1166                 run_end = ctx->time;
1167         else
1168                 run_end = event->tstamp_stopped;
1169
1170         event->total_time_enabled = run_end - event->tstamp_enabled;
1171
1172         if (event->state == PERF_EVENT_STATE_INACTIVE)
1173                 run_end = event->tstamp_stopped;
1174         else
1175                 run_end = perf_event_time(event);
1176
1177         event->total_time_running = run_end - event->tstamp_running;
1178
1179 }
1180
1181 /*
1182  * Update total_time_enabled and total_time_running for all events in a group.
1183  */
1184 static void update_group_times(struct perf_event *leader)
1185 {
1186         struct perf_event *event;
1187
1188         update_event_times(leader);
1189         list_for_each_entry(event, &leader->sibling_list, group_entry)
1190                 update_event_times(event);
1191 }
1192
1193 static struct list_head *
1194 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1195 {
1196         if (event->attr.pinned)
1197                 return &ctx->pinned_groups;
1198         else
1199                 return &ctx->flexible_groups;
1200 }
1201
1202 /*
1203  * Add a event from the lists for its context.
1204  * Must be called with ctx->mutex and ctx->lock held.
1205  */
1206 static void
1207 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1208 {
1209         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1210         event->attach_state |= PERF_ATTACH_CONTEXT;
1211
1212         /*
1213          * If we're a stand alone event or group leader, we go to the context
1214          * list, group events are kept attached to the group so that
1215          * perf_group_detach can, at all times, locate all siblings.
1216          */
1217         if (event->group_leader == event) {
1218                 struct list_head *list;
1219
1220                 if (is_software_event(event))
1221                         event->group_flags |= PERF_GROUP_SOFTWARE;
1222
1223                 list = ctx_group_list(event, ctx);
1224                 list_add_tail(&event->group_entry, list);
1225         }
1226
1227         if (is_cgroup_event(event))
1228                 ctx->nr_cgroups++;
1229
1230         list_add_rcu(&event->event_entry, &ctx->event_list);
1231         ctx->nr_events++;
1232         if (event->attr.inherit_stat)
1233                 ctx->nr_stat++;
1234
1235         ctx->generation++;
1236 }
1237
1238 /*
1239  * Initialize event state based on the perf_event_attr::disabled.
1240  */
1241 static inline void perf_event__state_init(struct perf_event *event)
1242 {
1243         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1244                                               PERF_EVENT_STATE_INACTIVE;
1245 }
1246
1247 /*
1248  * Called at perf_event creation and when events are attached/detached from a
1249  * group.
1250  */
1251 static void perf_event__read_size(struct perf_event *event)
1252 {
1253         int entry = sizeof(u64); /* value */
1254         int size = 0;
1255         int nr = 1;
1256
1257         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1258                 size += sizeof(u64);
1259
1260         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1261                 size += sizeof(u64);
1262
1263         if (event->attr.read_format & PERF_FORMAT_ID)
1264                 entry += sizeof(u64);
1265
1266         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1267                 nr += event->group_leader->nr_siblings;
1268                 size += sizeof(u64);
1269         }
1270
1271         size += entry * nr;
1272         event->read_size = size;
1273 }
1274
1275 static void perf_event__header_size(struct perf_event *event)
1276 {
1277         struct perf_sample_data *data;
1278         u64 sample_type = event->attr.sample_type;
1279         u16 size = 0;
1280
1281         perf_event__read_size(event);
1282
1283         if (sample_type & PERF_SAMPLE_IP)
1284                 size += sizeof(data->ip);
1285
1286         if (sample_type & PERF_SAMPLE_ADDR)
1287                 size += sizeof(data->addr);
1288
1289         if (sample_type & PERF_SAMPLE_PERIOD)
1290                 size += sizeof(data->period);
1291
1292         if (sample_type & PERF_SAMPLE_WEIGHT)
1293                 size += sizeof(data->weight);
1294
1295         if (sample_type & PERF_SAMPLE_READ)
1296                 size += event->read_size;
1297
1298         if (sample_type & PERF_SAMPLE_DATA_SRC)
1299                 size += sizeof(data->data_src.val);
1300
1301         if (sample_type & PERF_SAMPLE_TRANSACTION)
1302                 size += sizeof(data->txn);
1303
1304         event->header_size = size;
1305 }
1306
1307 static void perf_event__id_header_size(struct perf_event *event)
1308 {
1309         struct perf_sample_data *data;
1310         u64 sample_type = event->attr.sample_type;
1311         u16 size = 0;
1312
1313         if (sample_type & PERF_SAMPLE_TID)
1314                 size += sizeof(data->tid_entry);
1315
1316         if (sample_type & PERF_SAMPLE_TIME)
1317                 size += sizeof(data->time);
1318
1319         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1320                 size += sizeof(data->id);
1321
1322         if (sample_type & PERF_SAMPLE_ID)
1323                 size += sizeof(data->id);
1324
1325         if (sample_type & PERF_SAMPLE_STREAM_ID)
1326                 size += sizeof(data->stream_id);
1327
1328         if (sample_type & PERF_SAMPLE_CPU)
1329                 size += sizeof(data->cpu_entry);
1330
1331         event->id_header_size = size;
1332 }
1333
1334 static void perf_group_attach(struct perf_event *event)
1335 {
1336         struct perf_event *group_leader = event->group_leader, *pos;
1337
1338         /*
1339          * We can have double attach due to group movement in perf_event_open.
1340          */
1341         if (event->attach_state & PERF_ATTACH_GROUP)
1342                 return;
1343
1344         event->attach_state |= PERF_ATTACH_GROUP;
1345
1346         if (group_leader == event)
1347                 return;
1348
1349         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1350
1351         if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1352                         !is_software_event(event))
1353                 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1354
1355         list_add_tail(&event->group_entry, &group_leader->sibling_list);
1356         group_leader->nr_siblings++;
1357
1358         perf_event__header_size(group_leader);
1359
1360         list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1361                 perf_event__header_size(pos);
1362 }
1363
1364 /*
1365  * Remove a event from the lists for its context.
1366  * Must be called with ctx->mutex and ctx->lock held.
1367  */
1368 static void
1369 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1370 {
1371         struct perf_cpu_context *cpuctx;
1372
1373         WARN_ON_ONCE(event->ctx != ctx);
1374         lockdep_assert_held(&ctx->lock);
1375
1376         /*
1377          * We can have double detach due to exit/hot-unplug + close.
1378          */
1379         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1380                 return;
1381
1382         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1383
1384         if (is_cgroup_event(event)) {
1385                 ctx->nr_cgroups--;
1386                 cpuctx = __get_cpu_context(ctx);
1387                 /*
1388                  * if there are no more cgroup events
1389                  * then cler cgrp to avoid stale pointer
1390                  * in update_cgrp_time_from_cpuctx()
1391                  */
1392                 if (!ctx->nr_cgroups)
1393                         cpuctx->cgrp = NULL;
1394         }
1395
1396         ctx->nr_events--;
1397         if (event->attr.inherit_stat)
1398                 ctx->nr_stat--;
1399
1400         list_del_rcu(&event->event_entry);
1401
1402         if (event->group_leader == event)
1403                 list_del_init(&event->group_entry);
1404
1405         update_group_times(event);
1406
1407         /*
1408          * If event was in error state, then keep it
1409          * that way, otherwise bogus counts will be
1410          * returned on read(). The only way to get out
1411          * of error state is by explicit re-enabling
1412          * of the event
1413          */
1414         if (event->state > PERF_EVENT_STATE_OFF)
1415                 event->state = PERF_EVENT_STATE_OFF;
1416
1417         ctx->generation++;
1418 }
1419
1420 static void perf_group_detach(struct perf_event *event)
1421 {
1422         struct perf_event *sibling, *tmp;
1423         struct list_head *list = NULL;
1424
1425         /*
1426          * We can have double detach due to exit/hot-unplug + close.
1427          */
1428         if (!(event->attach_state & PERF_ATTACH_GROUP))
1429                 return;
1430
1431         event->attach_state &= ~PERF_ATTACH_GROUP;
1432
1433         /*
1434          * If this is a sibling, remove it from its group.
1435          */
1436         if (event->group_leader != event) {
1437                 list_del_init(&event->group_entry);
1438                 event->group_leader->nr_siblings--;
1439                 goto out;
1440         }
1441
1442         if (!list_empty(&event->group_entry))
1443                 list = &event->group_entry;
1444
1445         /*
1446          * If this was a group event with sibling events then
1447          * upgrade the siblings to singleton events by adding them
1448          * to whatever list we are on.
1449          */
1450         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1451                 if (list)
1452                         list_move_tail(&sibling->group_entry, list);
1453                 sibling->group_leader = sibling;
1454
1455                 /* Inherit group flags from the previous leader */
1456                 sibling->group_flags = event->group_flags;
1457
1458                 WARN_ON_ONCE(sibling->ctx != event->ctx);
1459         }
1460
1461 out:
1462         perf_event__header_size(event->group_leader);
1463
1464         list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1465                 perf_event__header_size(tmp);
1466 }
1467
1468 /*
1469  * User event without the task.
1470  */
1471 static bool is_orphaned_event(struct perf_event *event)
1472 {
1473         return event && !is_kernel_event(event) && !event->owner;
1474 }
1475
1476 /*
1477  * Event has a parent but parent's task finished and it's
1478  * alive only because of children holding refference.
1479  */
1480 static bool is_orphaned_child(struct perf_event *event)
1481 {
1482         return is_orphaned_event(event->parent);
1483 }
1484
1485 static void orphans_remove_work(struct work_struct *work);
1486
1487 static void schedule_orphans_remove(struct perf_event_context *ctx)
1488 {
1489         if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1490                 return;
1491
1492         if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1493                 get_ctx(ctx);
1494                 ctx->orphans_remove_sched = true;
1495         }
1496 }
1497
1498 static int __init perf_workqueue_init(void)
1499 {
1500         perf_wq = create_singlethread_workqueue("perf");
1501         WARN(!perf_wq, "failed to create perf workqueue\n");
1502         return perf_wq ? 0 : -1;
1503 }
1504
1505 core_initcall(perf_workqueue_init);
1506
1507 static inline int
1508 event_filter_match(struct perf_event *event)
1509 {
1510         return (event->cpu == -1 || event->cpu == smp_processor_id())
1511             && perf_cgroup_match(event);
1512 }
1513
1514 static void
1515 event_sched_out(struct perf_event *event,
1516                   struct perf_cpu_context *cpuctx,
1517                   struct perf_event_context *ctx)
1518 {
1519         u64 tstamp = perf_event_time(event);
1520         u64 delta;
1521
1522         WARN_ON_ONCE(event->ctx != ctx);
1523         lockdep_assert_held(&ctx->lock);
1524
1525         /*
1526          * An event which could not be activated because of
1527          * filter mismatch still needs to have its timings
1528          * maintained, otherwise bogus information is return
1529          * via read() for time_enabled, time_running:
1530          */
1531         if (event->state == PERF_EVENT_STATE_INACTIVE
1532             && !event_filter_match(event)) {
1533                 delta = tstamp - event->tstamp_stopped;
1534                 event->tstamp_running += delta;
1535                 event->tstamp_stopped = tstamp;
1536         }
1537
1538         if (event->state != PERF_EVENT_STATE_ACTIVE)
1539                 return;
1540
1541         perf_pmu_disable(event->pmu);
1542
1543         event->state = PERF_EVENT_STATE_INACTIVE;
1544         if (event->pending_disable) {
1545                 event->pending_disable = 0;
1546                 event->state = PERF_EVENT_STATE_OFF;
1547         }
1548         event->tstamp_stopped = tstamp;
1549         event->pmu->del(event, 0);
1550         event->oncpu = -1;
1551
1552         if (!is_software_event(event))
1553                 cpuctx->active_oncpu--;
1554         if (!--ctx->nr_active)
1555                 perf_event_ctx_deactivate(ctx);
1556         if (event->attr.freq && event->attr.sample_freq)
1557                 ctx->nr_freq--;
1558         if (event->attr.exclusive || !cpuctx->active_oncpu)
1559                 cpuctx->exclusive = 0;
1560
1561         if (is_orphaned_child(event))
1562                 schedule_orphans_remove(ctx);
1563
1564         perf_pmu_enable(event->pmu);
1565 }
1566
1567 static void
1568 group_sched_out(struct perf_event *group_event,
1569                 struct perf_cpu_context *cpuctx,
1570                 struct perf_event_context *ctx)
1571 {
1572         struct perf_event *event;
1573         int state = group_event->state;
1574
1575         event_sched_out(group_event, cpuctx, ctx);
1576
1577         /*
1578          * Schedule out siblings (if any):
1579          */
1580         list_for_each_entry(event, &group_event->sibling_list, group_entry)
1581                 event_sched_out(event, cpuctx, ctx);
1582
1583         if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1584                 cpuctx->exclusive = 0;
1585 }
1586
1587 struct remove_event {
1588         struct perf_event *event;
1589         bool detach_group;
1590 };
1591
1592 /*
1593  * Cross CPU call to remove a performance event
1594  *
1595  * We disable the event on the hardware level first. After that we
1596  * remove it from the context list.
1597  */
1598 static int __perf_remove_from_context(void *info)
1599 {
1600         struct remove_event *re = info;
1601         struct perf_event *event = re->event;
1602         struct perf_event_context *ctx = event->ctx;
1603         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1604
1605         raw_spin_lock(&ctx->lock);
1606         event_sched_out(event, cpuctx, ctx);
1607         if (re->detach_group)
1608                 perf_group_detach(event);
1609         list_del_event(event, ctx);
1610         if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1611                 ctx->is_active = 0;
1612                 cpuctx->task_ctx = NULL;
1613         }
1614         raw_spin_unlock(&ctx->lock);
1615
1616         return 0;
1617 }
1618
1619
1620 /*
1621  * Remove the event from a task's (or a CPU's) list of events.
1622  *
1623  * CPU events are removed with a smp call. For task events we only
1624  * call when the task is on a CPU.
1625  *
1626  * If event->ctx is a cloned context, callers must make sure that
1627  * every task struct that event->ctx->task could possibly point to
1628  * remains valid.  This is OK when called from perf_release since
1629  * that only calls us on the top-level context, which can't be a clone.
1630  * When called from perf_event_exit_task, it's OK because the
1631  * context has been detached from its task.
1632  */
1633 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1634 {
1635         struct perf_event_context *ctx = event->ctx;
1636         struct task_struct *task = ctx->task;
1637         struct remove_event re = {
1638                 .event = event,
1639                 .detach_group = detach_group,
1640         };
1641
1642         lockdep_assert_held(&ctx->mutex);
1643
1644         if (!task) {
1645                 /*
1646                  * Per cpu events are removed via an smp call. The removal can
1647                  * fail if the CPU is currently offline, but in that case we
1648                  * already called __perf_remove_from_context from
1649                  * perf_event_exit_cpu.
1650                  */
1651                 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1652                 return;
1653         }
1654
1655 retry:
1656         if (!task_function_call(task, __perf_remove_from_context, &re))
1657                 return;
1658
1659         raw_spin_lock_irq(&ctx->lock);
1660         /*
1661          * If we failed to find a running task, but find the context active now
1662          * that we've acquired the ctx->lock, retry.
1663          */
1664         if (ctx->is_active) {
1665                 raw_spin_unlock_irq(&ctx->lock);
1666                 /*
1667                  * Reload the task pointer, it might have been changed by
1668                  * a concurrent perf_event_context_sched_out().
1669                  */
1670                 task = ctx->task;
1671                 goto retry;
1672         }
1673
1674         /*
1675          * Since the task isn't running, its safe to remove the event, us
1676          * holding the ctx->lock ensures the task won't get scheduled in.
1677          */
1678         if (detach_group)
1679                 perf_group_detach(event);
1680         list_del_event(event, ctx);
1681         raw_spin_unlock_irq(&ctx->lock);
1682 }
1683
1684 /*
1685  * Cross CPU call to disable a performance event
1686  */
1687 int __perf_event_disable(void *info)
1688 {
1689         struct perf_event *event = info;
1690         struct perf_event_context *ctx = event->ctx;
1691         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1692
1693         /*
1694          * If this is a per-task event, need to check whether this
1695          * event's task is the current task on this cpu.
1696          *
1697          * Can trigger due to concurrent perf_event_context_sched_out()
1698          * flipping contexts around.
1699          */
1700         if (ctx->task && cpuctx->task_ctx != ctx)
1701                 return -EINVAL;
1702
1703         raw_spin_lock(&ctx->lock);
1704
1705         /*
1706          * If the event is on, turn it off.
1707          * If it is in error state, leave it in error state.
1708          */
1709         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1710                 update_context_time(ctx);
1711                 update_cgrp_time_from_event(event);
1712                 update_group_times(event);
1713                 if (event == event->group_leader)
1714                         group_sched_out(event, cpuctx, ctx);
1715                 else
1716                         event_sched_out(event, cpuctx, ctx);
1717                 event->state = PERF_EVENT_STATE_OFF;
1718         }
1719
1720         raw_spin_unlock(&ctx->lock);
1721
1722         return 0;
1723 }
1724
1725 /*
1726  * Disable a event.
1727  *
1728  * If event->ctx is a cloned context, callers must make sure that
1729  * every task struct that event->ctx->task could possibly point to
1730  * remains valid.  This condition is satisifed when called through
1731  * perf_event_for_each_child or perf_event_for_each because they
1732  * hold the top-level event's child_mutex, so any descendant that
1733  * goes to exit will block in sync_child_event.
1734  * When called from perf_pending_event it's OK because event->ctx
1735  * is the current context on this CPU and preemption is disabled,
1736  * hence we can't get into perf_event_task_sched_out for this context.
1737  */
1738 static void _perf_event_disable(struct perf_event *event)
1739 {
1740         struct perf_event_context *ctx = event->ctx;
1741         struct task_struct *task = ctx->task;
1742
1743         if (!task) {
1744                 /*
1745                  * Disable the event on the cpu that it's on
1746                  */
1747                 cpu_function_call(event->cpu, __perf_event_disable, event);
1748                 return;
1749         }
1750
1751 retry:
1752         if (!task_function_call(task, __perf_event_disable, event))
1753                 return;
1754
1755         raw_spin_lock_irq(&ctx->lock);
1756         /*
1757          * If the event is still active, we need to retry the cross-call.
1758          */
1759         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1760                 raw_spin_unlock_irq(&ctx->lock);
1761                 /*
1762                  * Reload the task pointer, it might have been changed by
1763                  * a concurrent perf_event_context_sched_out().
1764                  */
1765                 task = ctx->task;
1766                 goto retry;
1767         }
1768
1769         /*
1770          * Since we have the lock this context can't be scheduled
1771          * in, so we can change the state safely.
1772          */
1773         if (event->state == PERF_EVENT_STATE_INACTIVE) {
1774                 update_group_times(event);
1775                 event->state = PERF_EVENT_STATE_OFF;
1776         }
1777         raw_spin_unlock_irq(&ctx->lock);
1778 }
1779
/*
 * Exported entry point for disabling an event: runs _perf_event_disable()
 * with the event's context locked.
 *
 * Strictly speaking kernel users cannot create groups and therefore this
 * interface does not need the perf_event_ctx_lock() magic.
 */
void perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);

        _perf_event_disable(event);

        perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);
1793
1794 static void perf_set_shadow_time(struct perf_event *event,
1795                                  struct perf_event_context *ctx,
1796                                  u64 tstamp)
1797 {
1798         /*
1799          * use the correct time source for the time snapshot
1800          *
1801          * We could get by without this by leveraging the
1802          * fact that to get to this function, the caller
1803          * has most likely already called update_context_time()
1804          * and update_cgrp_time_xx() and thus both timestamp
1805          * are identical (or very close). Given that tstamp is,
1806          * already adjusted for cgroup, we could say that:
1807          *    tstamp - ctx->timestamp
1808          * is equivalent to
1809          *    tstamp - cgrp->timestamp.
1810          *
1811          * Then, in perf_output_read(), the calculation would
1812          * work with no changes because:
1813          * - event is guaranteed scheduled in
1814          * - no scheduled out in between
1815          * - thus the timestamp would be the same
1816          *
1817          * But this is a bit hairy.
1818          *
1819          * So instead, we have an explicit cgroup call to remain
1820          * within the time time source all along. We believe it
1821          * is cleaner and simpler to understand.
1822          */
1823         if (is_cgroup_event(event))
1824                 perf_cgroup_set_shadow_time(event, tstamp);
1825         else
1826                 event->shadow_ctx_time = tstamp - ctx->timestamp;
1827 }
1828
/*
 * Value of event->hw.interrupts that event_sched_in() checks for before
 * logging an unthrottle and resetting the counter to 0.
 */
#define MAX_INTERRUPTS	(~0ULL)

/* Defined later in this file; event_sched_in() needs them early. */
static void perf_log_itrace_start(struct perf_event *event);
static void perf_log_throttle(struct perf_event *event, int enable);
1833
1834 static int
1835 event_sched_in(struct perf_event *event,
1836                  struct perf_cpu_context *cpuctx,
1837                  struct perf_event_context *ctx)
1838 {
1839         u64 tstamp = perf_event_time(event);
1840         int ret = 0;
1841
1842         lockdep_assert_held(&ctx->lock);
1843
1844         if (event->state <= PERF_EVENT_STATE_OFF)
1845                 return 0;
1846
1847         event->state = PERF_EVENT_STATE_ACTIVE;
1848         event->oncpu = smp_processor_id();
1849
1850         /*
1851          * Unthrottle events, since we scheduled we might have missed several
1852          * ticks already, also for a heavily scheduling task there is little
1853          * guarantee it'll get a tick in a timely manner.
1854          */
1855         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1856                 perf_log_throttle(event, 1);
1857                 event->hw.interrupts = 0;
1858         }
1859
1860         /*
1861          * The new state must be visible before we turn it on in the hardware:
1862          */
1863         smp_wmb();
1864
1865         perf_pmu_disable(event->pmu);
1866
1867         event->tstamp_running += tstamp - event->tstamp_stopped;
1868
1869         perf_set_shadow_time(event, ctx, tstamp);
1870
1871         perf_log_itrace_start(event);
1872
1873         if (event->pmu->add(event, PERF_EF_START)) {
1874                 event->state = PERF_EVENT_STATE_INACTIVE;
1875                 event->oncpu = -1;
1876                 ret = -EAGAIN;
1877                 goto out;
1878         }
1879
1880         if (!is_software_event(event))
1881                 cpuctx->active_oncpu++;
1882         if (!ctx->nr_active++)
1883                 perf_event_ctx_activate(ctx);
1884         if (event->attr.freq && event->attr.sample_freq)
1885                 ctx->nr_freq++;
1886
1887         if (event->attr.exclusive)
1888                 cpuctx->exclusive = 1;
1889
1890         if (is_orphaned_child(event))
1891                 schedule_orphans_remove(ctx);
1892
1893 out:
1894         perf_pmu_enable(event->pmu);
1895
1896         return ret;
1897 }
1898
1899 static int
1900 group_sched_in(struct perf_event *group_event,
1901                struct perf_cpu_context *cpuctx,
1902                struct perf_event_context *ctx)
1903 {
1904         struct perf_event *event, *partial_group = NULL;
1905         struct pmu *pmu = ctx->pmu;
1906         u64 now = ctx->time;
1907         bool simulate = false;
1908
1909         if (group_event->state == PERF_EVENT_STATE_OFF)
1910                 return 0;
1911
1912         pmu->start_txn(pmu);
1913
1914         if (event_sched_in(group_event, cpuctx, ctx)) {
1915                 pmu->cancel_txn(pmu);
1916                 perf_cpu_hrtimer_restart(cpuctx);
1917                 return -EAGAIN;
1918         }
1919
1920         /*
1921          * Schedule in siblings as one group (if any):
1922          */
1923         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1924                 if (event_sched_in(event, cpuctx, ctx)) {
1925                         partial_group = event;
1926                         goto group_error;
1927                 }
1928         }
1929
1930         if (!pmu->commit_txn(pmu))
1931                 return 0;
1932
1933 group_error:
1934         /*
1935          * Groups can be scheduled in as one unit only, so undo any
1936          * partial group before returning:
1937          * The events up to the failed event are scheduled out normally,
1938          * tstamp_stopped will be updated.
1939          *
1940          * The failed events and the remaining siblings need to have
1941          * their timings updated as if they had gone thru event_sched_in()
1942          * and event_sched_out(). This is required to get consistent timings
1943          * across the group. This also takes care of the case where the group
1944          * could never be scheduled by ensuring tstamp_stopped is set to mark
1945          * the time the event was actually stopped, such that time delta
1946          * calculation in update_event_times() is correct.
1947          */
1948         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1949                 if (event == partial_group)
1950                         simulate = true;
1951
1952                 if (simulate) {
1953                         event->tstamp_running += now - event->tstamp_stopped;
1954                         event->tstamp_stopped = now;
1955                 } else {
1956                         event_sched_out(event, cpuctx, ctx);
1957                 }
1958         }
1959         event_sched_out(group_event, cpuctx, ctx);
1960
1961         pmu->cancel_txn(pmu);
1962
1963         perf_cpu_hrtimer_restart(cpuctx);
1964
1965         return -EAGAIN;
1966 }
1967
1968 /*
1969  * Work out whether we can put this event group on the CPU now.
1970  */
1971 static int group_can_go_on(struct perf_event *event,
1972                            struct perf_cpu_context *cpuctx,
1973                            int can_add_hw)
1974 {
1975         /*
1976          * Groups consisting entirely of software events can always go on.
1977          */
1978         if (event->group_flags & PERF_GROUP_SOFTWARE)
1979                 return 1;
1980         /*
1981          * If an exclusive group is already on, no other hardware
1982          * events can go on.
1983          */
1984         if (cpuctx->exclusive)
1985                 return 0;
1986         /*
1987          * If this group is exclusive and there are already
1988          * events on the CPU, it can't go on.
1989          */
1990         if (event->attr.exclusive && cpuctx->active_oncpu)
1991                 return 0;
1992         /*
1993          * Otherwise, try to add it if all previous groups were able
1994          * to go on.
1995          */
1996         return can_add_hw;
1997 }
1998
1999 static void add_event_to_ctx(struct perf_event *event,
2000                                struct perf_event_context *ctx)
2001 {
2002         u64 tstamp = perf_event_time(event);
2003
2004         list_add_event(event, ctx);
2005         perf_group_attach(event);
2006         event->tstamp_enabled = tstamp;
2007         event->tstamp_running = tstamp;
2008         event->tstamp_stopped = tstamp;
2009 }
2010
/* Forward declarations for helpers defined further down in this file. */
static void task_ctx_sched_out(struct perf_event_context *ctx);
static void ctx_sched_in(struct perf_event_context *ctx,
			 struct perf_cpu_context *cpuctx,
			 enum event_type_t event_type,
			 struct task_struct *task);
2017
2018 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2019                                 struct perf_event_context *ctx,
2020                                 struct task_struct *task)
2021 {
2022         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2023         if (ctx)
2024                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2025         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2026         if (ctx)
2027                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2028 }
2029
2030 /*
2031  * Cross CPU call to install and enable a performance event
2032  *
2033  * Must be called with ctx->mutex held
2034  */
2035 static int  __perf_install_in_context(void *info)
2036 {
2037         struct perf_event *event = info;
2038         struct perf_event_context *ctx = event->ctx;
2039         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2040         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2041         struct task_struct *task = current;
2042
2043         perf_ctx_lock(cpuctx, task_ctx);
2044         perf_pmu_disable(cpuctx->ctx.pmu);
2045
2046         /*
2047          * If there was an active task_ctx schedule it out.
2048          */
2049         if (task_ctx)
2050                 task_ctx_sched_out(task_ctx);
2051
2052         /*
2053          * If the context we're installing events in is not the
2054          * active task_ctx, flip them.
2055          */
2056         if (ctx->task && task_ctx != ctx) {
2057                 if (task_ctx)
2058                         raw_spin_unlock(&task_ctx->lock);
2059                 raw_spin_lock(&ctx->lock);
2060                 task_ctx = ctx;
2061         }
2062
2063         if (task_ctx) {
2064                 cpuctx->task_ctx = task_ctx;
2065                 task = task_ctx->task;
2066         }
2067
2068         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2069
2070         update_context_time(ctx);
2071         /*
2072          * update cgrp time only if current cgrp
2073          * matches event->cgrp. Must be done before
2074          * calling add_event_to_ctx()
2075          */
2076         update_cgrp_time_from_event(event);
2077
2078         add_event_to_ctx(event, ctx);
2079
2080         /*
2081          * Schedule everything back in
2082          */
2083         perf_event_sched_in(cpuctx, task_ctx, task);
2084
2085         perf_pmu_enable(cpuctx->ctx.pmu);
2086         perf_ctx_unlock(cpuctx, task_ctx);
2087
2088         return 0;
2089 }
2090
2091 /*
2092  * Attach a performance event to a context
2093  *
2094  * First we add the event to the list with the hardware enable bit
2095  * in event->hw_config cleared.
2096  *
2097  * If the event is attached to a task which is on a CPU we use a smp
2098  * call to enable it in the task context. The task might have been
2099  * scheduled away, but we check this in the smp call again.
2100  */
2101 static void
2102 perf_install_in_context(struct perf_event_context *ctx,
2103                         struct perf_event *event,
2104                         int cpu)
2105 {
2106         struct task_struct *task = ctx->task;
2107
2108         lockdep_assert_held(&ctx->mutex);
2109
2110         event->ctx = ctx;
2111         if (event->cpu != -1)
2112                 event->cpu = cpu;
2113
2114         if (!task) {
2115                 /*
2116                  * Per cpu events are installed via an smp call and
2117                  * the install is always successful.
2118                  */
2119                 cpu_function_call(cpu, __perf_install_in_context, event);
2120                 return;
2121         }
2122
2123 retry:
2124         if (!task_function_call(task, __perf_install_in_context, event))
2125                 return;
2126
2127         raw_spin_lock_irq(&ctx->lock);
2128         /*
2129          * If we failed to find a running task, but find the context active now
2130          * that we've acquired the ctx->lock, retry.
2131          */
2132         if (ctx->is_active) {
2133                 raw_spin_unlock_irq(&ctx->lock);
2134                 /*
2135                  * Reload the task pointer, it might have been changed by
2136                  * a concurrent perf_event_context_sched_out().
2137                  */
2138                 task = ctx->task;
2139                 goto retry;
2140         }
2141
2142         /*
2143          * Since the task isn't running, its safe to add the event, us holding
2144          * the ctx->lock ensures the task won't get scheduled in.
2145          */
2146         add_event_to_ctx(event, ctx);
2147         raw_spin_unlock_irq(&ctx->lock);
2148 }
2149
2150 /*
2151  * Put a event into inactive state and update time fields.
2152  * Enabling the leader of a group effectively enables all
2153  * the group members that aren't explicitly disabled, so we
2154  * have to update their ->tstamp_enabled also.
2155  * Note: this works for group members as well as group leaders
2156  * since the non-leader members' sibling_lists will be empty.
2157  */
2158 static void __perf_event_mark_enabled(struct perf_event *event)
2159 {
2160         struct perf_event *sub;
2161         u64 tstamp = perf_event_time(event);
2162
2163         event->state = PERF_EVENT_STATE_INACTIVE;
2164         event->tstamp_enabled = tstamp - event->total_time_enabled;
2165         list_for_each_entry(sub, &event->sibling_list, group_entry) {
2166                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2167                         sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2168         }
2169 }
2170
2171 /*
2172  * Cross CPU call to enable a performance event
2173  */
2174 static int __perf_event_enable(void *info)
2175 {
2176         struct perf_event *event = info;
2177         struct perf_event_context *ctx = event->ctx;
2178         struct perf_event *leader = event->group_leader;
2179         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2180         int err;
2181
2182         /*
2183          * There's a time window between 'ctx->is_active' check
2184          * in perf_event_enable function and this place having:
2185          *   - IRQs on
2186          *   - ctx->lock unlocked
2187          *
2188          * where the task could be killed and 'ctx' deactivated
2189          * by perf_event_exit_task.
2190          */
2191         if (!ctx->is_active)
2192                 return -EINVAL;
2193
2194         raw_spin_lock(&ctx->lock);
2195         update_context_time(ctx);
2196
2197         if (event->state >= PERF_EVENT_STATE_INACTIVE)
2198                 goto unlock;
2199
2200         /*
2201          * set current task's cgroup time reference point
2202          */
2203         perf_cgroup_set_timestamp(current, ctx);
2204
2205         __perf_event_mark_enabled(event);
2206
2207         if (!event_filter_match(event)) {
2208                 if (is_cgroup_event(event))
2209                         perf_cgroup_defer_enabled(event);
2210                 goto unlock;
2211         }
2212
2213         /*
2214          * If the event is in a group and isn't the group leader,
2215          * then don't put it on unless the group is on.
2216          */
2217         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
2218                 goto unlock;
2219
2220         if (!group_can_go_on(event, cpuctx, 1)) {
2221                 err = -EEXIST;
2222         } else {
2223                 if (event == leader)
2224                         err = group_sched_in(event, cpuctx, ctx);
2225                 else
2226                         err = event_sched_in(event, cpuctx, ctx);
2227         }
2228
2229         if (err) {
2230                 /*
2231                  * If this event can't go on and it's part of a
2232                  * group, then the whole group has to come off.
2233                  */
2234                 if (leader != event) {
2235                         group_sched_out(leader, cpuctx, ctx);
2236                         perf_cpu_hrtimer_restart(cpuctx);
2237                 }
2238                 if (leader->attr.pinned) {
2239                         update_group_times(leader);
2240                         leader->state = PERF_EVENT_STATE_ERROR;
2241                 }
2242         }
2243
2244 unlock:
2245         raw_spin_unlock(&ctx->lock);
2246
2247         return 0;
2248 }
2249
2250 /*
2251  * Enable a event.
2252  *
2253  * If event->ctx is a cloned context, callers must make sure that
2254  * every task struct that event->ctx->task could possibly point to
2255  * remains valid.  This condition is satisfied when called through
2256  * perf_event_for_each_child or perf_event_for_each as described
2257  * for perf_event_disable.
2258  */
2259 static void _perf_event_enable(struct perf_event *event)
2260 {
2261         struct perf_event_context *ctx = event->ctx;
2262         struct task_struct *task = ctx->task;
2263
2264         if (!task) {
2265                 /*
2266                  * Enable the event on the cpu that it's on
2267                  */
2268                 cpu_function_call(event->cpu, __perf_event_enable, event);
2269                 return;
2270         }
2271
2272         raw_spin_lock_irq(&ctx->lock);
2273         if (event->state >= PERF_EVENT_STATE_INACTIVE)
2274                 goto out;
2275
2276         /*
2277          * If the event is in error state, clear that first.
2278          * That way, if we see the event in error state below, we
2279          * know that it has gone back into error state, as distinct
2280          * from the task having been scheduled away before the
2281          * cross-call arrived.
2282          */
2283         if (event->state == PERF_EVENT_STATE_ERROR)
2284                 event->state = PERF_EVENT_STATE_OFF;
2285
2286 retry:
2287         if (!ctx->is_active) {
2288                 __perf_event_mark_enabled(event);
2289                 goto out;
2290         }
2291
2292         raw_spin_unlock_irq(&ctx->lock);
2293
2294         if (!task_function_call(task, __perf_event_enable, event))
2295                 return;
2296
2297         raw_spin_lock_irq(&ctx->lock);
2298
2299         /*
2300          * If the context is active and the event is still off,
2301          * we need to retry the cross-call.
2302          */
2303         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2304                 /*
2305                  * task could have been flipped by a concurrent
2306                  * perf_event_context_sched_out()
2307                  */
2308                 task = ctx->task;
2309                 goto retry;
2310         }
2311
2312 out:
2313         raw_spin_unlock_irq(&ctx->lock);
2314 }
2315
/*
 * Exported, locked wrapper around _perf_event_enable().
 * See perf_event_disable().
 */
void perf_event_enable(struct perf_event *event)
{
	struct perf_event_context *locked_ctx = perf_event_ctx_lock(event);

	_perf_event_enable(event);
	perf_event_ctx_unlock(event, locked_ctx);
}
EXPORT_SYMBOL_GPL(perf_event_enable);
2328
2329 static int _perf_event_refresh(struct perf_event *event, int refresh)
2330 {
2331         /*
2332          * not supported on inherited events
2333          */
2334         if (event->attr.inherit || !is_sampling_event(event))
2335                 return -EINVAL;
2336
2337         atomic_add(refresh, &event->event_limit);
2338         _perf_event_enable(event);
2339
2340         return 0;
2341 }
2342
/*
 * Exported, locked wrapper around _perf_event_refresh(); its return value
 * is passed through. See perf_event_disable().
 */
int perf_event_refresh(struct perf_event *event, int refresh)
{
	struct perf_event_context *locked_ctx = perf_event_ctx_lock(event);
	int err = _perf_event_refresh(event, refresh);

	perf_event_ctx_unlock(event, locked_ctx);

	return err;
}
EXPORT_SYMBOL_GPL(perf_event_refresh);
2358
2359 static void ctx_sched_out(struct perf_event_context *ctx,
2360                           struct perf_cpu_context *cpuctx,
2361                           enum event_type_t event_type)
2362 {
2363         struct perf_event *event;
2364         int is_active = ctx->is_active;
2365
2366         ctx->is_active &= ~event_type;
2367         if (likely(!ctx->nr_events))
2368                 return;
2369
2370         update_context_time(ctx);
2371         update_cgrp_time_from_cpuctx(cpuctx);
2372         if (!ctx->nr_active)
2373                 return;
2374
2375         perf_pmu_disable(ctx->pmu);
2376         if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
2377                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2378                         group_sched_out(event, cpuctx, ctx);
2379         }
2380
2381         if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
2382                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2383                         group_sched_out(event, cpuctx, ctx);
2384         }
2385         perf_pmu_enable(ctx->pmu);
2386 }
2387
2388 /*
2389  * Test whether two contexts are equivalent, i.e. whether they have both been
2390  * cloned from the same version of the same context.
2391  *
2392  * Equivalence is measured using a generation number in the context that is
2393  * incremented on each modification to it; see unclone_ctx(), list_add_event()
2394  * and list_del_event().
2395  */
2396 static int context_equiv(struct perf_event_context *ctx1,
2397                          struct perf_event_context *ctx2)
2398 {
2399         lockdep_assert_held(&ctx1->lock);
2400         lockdep_assert_held(&ctx2->lock);
2401
2402         /* Pinning disables the swap optimization */
2403         if (ctx1->pin_count || ctx2->pin_count)
2404                 return 0;
2405
2406         /* If ctx1 is the parent of ctx2 */
2407         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2408                 return 1;
2409
2410         /* If ctx2 is the parent of ctx1 */
2411         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2412                 return 1;
2413
2414         /*
2415          * If ctx1 and ctx2 have the same parent; we flatten the parent
2416          * hierarchy, see perf_event_init_context().
2417          */
2418         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2419                         ctx1->parent_gen == ctx2->parent_gen)
2420                 return 1;
2421
2422         /* Unmatched */
2423         return 0;
2424 }
2425
2426 static void __perf_event_sync_stat(struct perf_event *event,
2427                                      struct perf_event *next_event)
2428 {
2429         u64 value;
2430
2431         if (!event->attr.inherit_stat)
2432                 return;
2433
2434         /*
2435          * Update the event value, we cannot use perf_event_read()
2436          * because we're in the middle of a context switch and have IRQs
2437          * disabled, which upsets smp_call_function_single(), however
2438          * we know the event must be on the current CPU, therefore we
2439          * don't need to use it.
2440          */
2441         switch (event->state) {
2442         case PERF_EVENT_STATE_ACTIVE:
2443                 event->pmu->read(event);
2444                 /* fall-through */
2445
2446         case PERF_EVENT_STATE_INACTIVE:
2447                 update_event_times(event);
2448                 break;
2449
2450         default:
2451                 break;
2452         }
2453
2454         /*
2455          * In order to keep per-task stats reliable we need to flip the event
2456          * values when we flip the contexts.
2457          */
2458         value = local64_read(&next_event->count);
2459         value = local64_xchg(&event->count, value);
2460         local64_set(&next_event->count, value);
2461
2462         swap(event->total_time_enabled, next_event->total_time_enabled);
2463         swap(event->total_time_running, next_event->total_time_running);
2464
2465         /*
2466          * Since we swizzled the values, update the user visible data too.
2467          */
2468         perf_event_update_userpage(event);
2469         perf_event_update_userpage(next_event);
2470 }
2471
2472 static void perf_event_sync_stat(struct perf_event_context *ctx,
2473                                    struct perf_event_context *next_ctx)
2474 {
2475         struct perf_event *event, *next_event;
2476
2477         if (!ctx->nr_stat)
2478                 return;
2479
2480         update_context_time(ctx);
2481
2482         event = list_first_entry(&ctx->event_list,
2483                                    struct perf_event, event_entry);
2484
2485         next_event = list_first_entry(&next_ctx->event_list,
2486                                         struct perf_event, event_entry);
2487
2488         while (&event->event_entry != &ctx->event_list &&
2489                &next_event->event_entry != &next_ctx->event_list) {
2490
2491                 __perf_event_sync_stat(event, next_event);
2492
2493                 event = list_next_entry(event, event_entry);
2494                 next_event = list_next_entry(next_event, event_entry);
2495         }
2496 }
2497
2498 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2499                                          struct task_struct *next)
2500 {
2501         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2502         struct perf_event_context *next_ctx;
2503         struct perf_event_context *parent, *next_parent;
2504         struct perf_cpu_context *cpuctx;
2505         int do_switch = 1;
2506
2507         if (likely(!ctx))
2508                 return;
2509
2510         cpuctx = __get_cpu_context(ctx);
2511         if (!cpuctx->task_ctx)
2512                 return;
2513
2514         rcu_read_lock();
2515         next_ctx = next->perf_event_ctxp[ctxn];
2516         if (!next_ctx)
2517                 goto unlock;
2518
2519         parent = rcu_dereference(ctx->parent_ctx);
2520         next_parent = rcu_dereference(next_ctx->parent_ctx);
2521
2522         /* If neither context have a parent context; they cannot be clones. */
2523         if (!parent && !next_parent)
2524                 goto unlock;
2525
2526         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2527                 /*
2528                  * Looks like the two contexts are clones, so we might be
2529                  * able to optimize the context switch.  We lock both
2530                  * contexts and check that they are clones under the
2531                  * lock (including re-checking that neither has been
2532                  * uncloned in the meantime).  It doesn't matter which
2533                  * order we take the locks because no other cpu could
2534                  * be trying to lock both of these tasks.
2535                  */
2536                 raw_spin_lock(&ctx->lock);
2537                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2538                 if (context_equiv(ctx, next_ctx)) {
2539                         /*
2540                          * XXX do we need a memory barrier of sorts
2541                          * wrt to rcu_dereference() of perf_event_ctxp
2542                          */
2543                         task->perf_event_ctxp[ctxn] = next_ctx;
2544                         next->perf_event_ctxp[ctxn] = ctx;
2545                         ctx->task = next;
2546                         next_ctx->task = task;
2547
2548                         swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2549
2550                         do_switch = 0;
2551
2552                         perf_event_sync_stat(ctx, next_ctx);
2553                 }
2554                 raw_spin_unlock(&next_ctx->lock);
2555                 raw_spin_unlock(&ctx->lock);
2556         }
2557 unlock:
2558         rcu_read_unlock();
2559
2560         if (do_switch) {
2561                 raw_spin_lock(&ctx->lock);
2562                 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2563                 cpuctx->task_ctx = NULL;
2564                 raw_spin_unlock(&ctx->lock);
2565         }
2566 }
2567
2568 void perf_sched_cb_dec(struct pmu *pmu)
2569 {
2570         this_cpu_dec(perf_sched_cb_usages);
2571 }
2572
2573 void perf_sched_cb_inc(struct pmu *pmu)
2574 {
2575         this_cpu_inc(perf_sched_cb_usages);
2576 }
2577
2578 /*
2579  * This function provides the context switch callback to the lower code
2580  * layer. It is invoked ONLY when the context switch callback is enabled.
2581  */
2582 static void perf_pmu_sched_task(struct task_struct *prev,
2583                                 struct task_struct *next,
2584                                 bool sched_in)
2585 {
2586         struct perf_cpu_context *cpuctx;
2587         struct pmu *pmu;
2588         unsigned long flags;
2589
2590         if (prev == next)
2591                 return;
2592
2593         local_irq_save(flags);
2594
2595         rcu_read_lock();
2596
2597         list_for_each_entry_rcu(pmu, &pmus, entry) {
2598                 if (pmu->sched_task) {
2599                         cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2600
2601                         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2602
2603                         perf_pmu_disable(pmu);
2604
2605                         pmu->sched_task(cpuctx->task_ctx, sched_in);
2606
2607                         perf_pmu_enable(pmu);
2608
2609                         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2610                 }
2611         }
2612
2613         rcu_read_unlock();
2614
2615         local_irq_restore(flags);
2616 }
2617
/* Step @ctxn through 0 .. perf_nr_task_contexts - 1. */
#define for_each_task_context_nr(ctxn)					\
	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2620
2621 /*
2622  * Called from scheduler to remove the events of the current task,
2623  * with interrupts disabled.
2624  *
2625  * We stop each event and update the event value in event->count.
2626  *
2627  * This does not protect us against NMI, but disable()
2628  * sets the disabled bit in the control field of event _before_
2629  * accessing the event control register. If a NMI hits, then it will
2630  * not restart the event.
2631  */
2632 void __perf_event_task_sched_out(struct task_struct *task,
2633                                  struct task_struct *next)
2634 {
2635         int ctxn;
2636
2637         if (__this_cpu_read(perf_sched_cb_usages))
2638                 perf_pmu_sched_task(task, next, false);
2639
2640         for_each_task_context_nr(ctxn)
2641                 perf_event_context_sched_out(task, ctxn, next);
2642
2643         /*
2644          * if cgroup events exist on this CPU, then we need
2645          * to check if we have to switch out PMU state.
2646          * cgroup event are system-wide mode only
2647          */
2648         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2649                 perf_cgroup_sched_out(task, next);
2650 }
2651
2652 static void task_ctx_sched_out(struct perf_event_context *ctx)
2653 {
2654         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2655
2656         if (!cpuctx->task_ctx)
2657                 return;
2658
2659         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2660                 return;
2661
2662         ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2663         cpuctx->task_ctx = NULL;
2664 }
2665
2666 /*
2667  * Called with IRQs disabled
2668  */
2669 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2670                               enum event_type_t event_type)
2671 {
2672         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2673 }
2674
2675 static void
2676 ctx_pinned_sched_in(struct perf_event_context *ctx,
2677                     struct perf_cpu_context *cpuctx)
2678 {
2679         struct perf_event *event;
2680
2681         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2682                 if (event->state <= PERF_EVENT_STATE_OFF)
2683                         continue;
2684                 if (!event_filter_match(event))
2685                         continue;
2686
2687                 /* may need to reset tstamp_enabled */
2688                 if (is_cgroup_event(event))
2689                         perf_cgroup_mark_enabled(event, ctx);
2690
2691                 if (group_can_go_on(event, cpuctx, 1))
2692                         group_sched_in(event, cpuctx, ctx);
2693
2694                 /*
2695                  * If this pinned group hasn't been scheduled,
2696                  * put it in error state.
2697                  */
2698                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2699                         update_group_times(event);
2700                         event->state = PERF_EVENT_STATE_ERROR;
2701                 }
2702         }
2703 }
2704
2705 static void
2706 ctx_flexible_sched_in(struct perf_event_context *ctx,
2707                       struct perf_cpu_context *cpuctx)
2708 {
2709         struct perf_event *event;
2710         int can_add_hw = 1;
2711
2712         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2713                 /* Ignore events in OFF or ERROR state */
2714                 if (event->state <= PERF_EVENT_STATE_OFF)
2715                         continue;
2716                 /*
2717                  * Listen to the 'cpu' scheduling filter constraint
2718                  * of events:
2719                  */
2720                 if (!event_filter_match(event))
2721                         continue;
2722
2723                 /* may need to reset tstamp_enabled */
2724                 if (is_cgroup_event(event))
2725                         perf_cgroup_mark_enabled(event, ctx);
2726
2727                 if (group_can_go_on(event, cpuctx, can_add_hw)) {
2728                         if (group_sched_in(event, cpuctx, ctx))
2729                                 can_add_hw = 0;
2730                 }
2731         }
2732 }
2733
2734 static void
2735 ctx_sched_in(struct perf_event_context *ctx,
2736              struct perf_cpu_context *cpuctx,
2737              enum event_type_t event_type,
2738              struct task_struct *task)
2739 {
2740         u64 now;
2741         int is_active = ctx->is_active;
2742
2743         ctx->is_active |= event_type;
2744         if (likely(!ctx->nr_events))
2745                 return;
2746
2747         now = perf_clock();
2748         ctx->timestamp = now;
2749         perf_cgroup_set_timestamp(task, ctx);
2750         /*
2751          * First go through the list and put on any pinned groups
2752          * in order to give them the best chance of going on.
2753          */
2754         if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2755                 ctx_pinned_sched_in(ctx, cpuctx);
2756
2757         /* Then walk through the lower prio flexible groups */
2758         if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2759                 ctx_flexible_sched_in(ctx, cpuctx);
2760 }
2761
2762 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2763                              enum event_type_t event_type,
2764                              struct task_struct *task)
2765 {
2766         struct perf_event_context *ctx = &cpuctx->ctx;
2767
2768         ctx_sched_in(ctx, cpuctx, event_type, task);
2769 }
2770
2771 static void perf_event_context_sched_in(struct perf_event_context *ctx,
2772                                         struct task_struct *task)
2773 {
2774         struct perf_cpu_context *cpuctx;
2775
2776         cpuctx = __get_cpu_context(ctx);
2777         if (cpuctx->task_ctx == ctx)
2778                 return;
2779
2780         perf_ctx_lock(cpuctx, ctx);
2781         perf_pmu_disable(ctx->pmu);
2782         /*
2783          * We want to keep the following priority order:
2784          * cpu pinned (that don't need to move), task pinned,
2785          * cpu flexible, task flexible.
2786          */
2787         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2788
2789         if (ctx->nr_events)
2790                 cpuctx->task_ctx = ctx;
2791
2792         perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2793
2794         perf_pmu_enable(ctx->pmu);
2795         perf_ctx_unlock(cpuctx, ctx);
2796 }
2797
2798 /*
2799  * Called from scheduler to add the events of the current task
2800  * with interrupts disabled.
2801  *
2802  * We restore the event value and then enable it.
2803  *
2804  * This does not protect us against NMI, but enable()
2805  * sets the enabled bit in the control field of event _before_
2806  * accessing the event control register. If a NMI hits, then it will
2807  * keep the event running.
2808  */
2809 void __perf_event_task_sched_in(struct task_struct *prev,
2810                                 struct task_struct *task)
2811 {
2812         struct perf_event_context *ctx;
2813         int ctxn;
2814
2815         for_each_task_context_nr(ctxn) {
2816                 ctx = task->perf_event_ctxp[ctxn];
2817                 if (likely(!ctx))
2818                         continue;
2819
2820                 perf_event_context_sched_in(ctx, task);
2821         }
2822         /*
2823          * if cgroup events exist on this CPU, then we need
2824          * to check if we have to switch in PMU state.
2825          * cgroup event are system-wide mode only
2826          */
2827         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2828                 perf_cgroup_sched_in(prev, task);
2829
2830         if (__this_cpu_read(perf_sched_cb_usages))
2831                 perf_pmu_sched_task(prev, task, true);
2832 }
2833
2834 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2835 {
2836         u64 frequency = event->attr.sample_freq;
2837         u64 sec = NSEC_PER_SEC;
2838         u64 divisor, dividend;
2839
2840         int count_fls, nsec_fls, frequency_fls, sec_fls;
2841
2842         count_fls = fls64(count);
2843         nsec_fls = fls64(nsec);
2844         frequency_fls = fls64(frequency);
2845         sec_fls = 30;
2846
2847         /*
2848          * We got @count in @nsec, with a target of sample_freq HZ
2849          * the target period becomes:
2850          *
2851          *             @count * 10^9
2852          * period = -------------------
2853          *          @nsec * sample_freq
2854          *
2855          */
2856
2857         /*
2858          * Reduce accuracy by one bit such that @a and @b converge
2859          * to a similar magnitude.
2860          */
2861 #define REDUCE_FLS(a, b)                \
2862 do {                                    \
2863         if (a##_fls > b##_fls) {        \
2864                 a >>= 1;                \
2865                 a##_fls--;              \
2866         } else {                        \
2867                 b >>= 1;                \
2868                 b##_fls--;              \
2869         }                               \
2870 } while (0)
2871
2872         /*
2873          * Reduce accuracy until either term fits in a u64, then proceed with
2874          * the other, so that finally we can do a u64/u64 division.
2875          */
2876         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2877                 REDUCE_FLS(nsec, frequency);
2878                 REDUCE_FLS(sec, count);
2879         }
2880
2881         if (count_fls + sec_fls > 64) {
2882                 divisor = nsec * frequency;
2883
2884                 while (count_fls + sec_fls > 64) {
2885                         REDUCE_FLS(count, sec);
2886                         divisor >>= 1;
2887                 }
2888
2889                 dividend = count * sec;
2890         } else {
2891                 dividend = count * sec;
2892
2893                 while (nsec_fls + frequency_fls > 64) {
2894                         REDUCE_FLS(nsec, frequency);
2895                         dividend >>= 1;
2896                 }
2897
2898                 divisor = nsec * frequency;
2899         }
2900
2901         if (!divisor)
2902                 return dividend;
2903
2904         return div64_u64(dividend, divisor);
2905 }
2906
2907 static DEFINE_PER_CPU(int, perf_throttled_count);
2908 static DEFINE_PER_CPU(u64, perf_throttled_seq);
2909
2910 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2911 {
2912         struct hw_perf_event *hwc = &event->hw;
2913         s64 period, sample_period;
2914         s64 delta;
2915
2916         period = perf_calculate_period(event, nsec, count);
2917
2918         delta = (s64)(period - hwc->sample_period);
2919         delta = (delta + 7) / 8; /* low pass filter */
2920
2921         sample_period = hwc->sample_period + delta;
2922
2923         if (!sample_period)
2924                 sample_period = 1;
2925
2926         hwc->sample_period = sample_period;
2927
2928         if (local64_read(&hwc->period_left) > 8*sample_period) {
2929                 if (disable)
2930                         event->pmu->stop(event, PERF_EF_UPDATE);
2931
2932                 local64_set(&hwc->period_left, 0);
2933
2934                 if (disable)
2935                         event->pmu->start(event, PERF_EF_RELOAD);
2936         }
2937 }
2938
2939 /*
2940  * combine freq adjustment with unthrottling to avoid two passes over the
2941  * events. At the same time, make sure, having freq events does not change
2942  * the rate of unthrottling as that would introduce bias.
2943  */
2944 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2945                                            int needs_unthr)
2946 {
2947         struct perf_event *event;
2948         struct hw_perf_event *hwc;
2949         u64 now, period = TICK_NSEC;
2950         s64 delta;
2951
2952         /*
2953          * only need to iterate over all events iff:
2954          * - context have events in frequency mode (needs freq adjust)
2955          * - there are events to unthrottle on this cpu
2956          */
2957         if (!(ctx->nr_freq || needs_unthr))
2958                 return;
2959
2960         raw_spin_lock(&ctx->lock);
2961         perf_pmu_disable(ctx->pmu);
2962
2963         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2964                 if (event->state != PERF_EVENT_STATE_ACTIVE)
2965                         continue;
2966
2967                 if (!event_filter_match(event))
2968                         continue;
2969
2970                 perf_pmu_disable(event->pmu);
2971
2972                 hwc = &event->hw;
2973
2974                 if (hwc->interrupts == MAX_INTERRUPTS) {
2975                         hwc->interrupts = 0;
2976                         perf_log_throttle(event, 1);
2977                         event->pmu->start(event, 0);
2978                 }
2979
2980                 if (!event->attr.freq || !event->attr.sample_freq)
2981                         goto next;
2982
2983                 /*
2984                  * stop the event and update event->count
2985                  */
2986                 event->pmu->stop(event, PERF_EF_UPDATE);
2987
2988                 now = local64_read(&event->count);
2989                 delta = now - hwc->freq_count_stamp;
2990                 hwc->freq_count_stamp = now;
2991
2992                 /*
2993                  * restart the event
2994                  * reload only if value has changed
2995                  * we have stopped the event so tell that
2996                  * to perf_adjust_period() to avoid stopping it
2997                  * twice.
2998                  */
2999                 if (delta > 0)
3000                         perf_adjust_period(event, period, delta, false);
3001
3002                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3003         next:
3004                 perf_pmu_enable(event->pmu);
3005         }
3006
3007         perf_pmu_enable(ctx->pmu);
3008         raw_spin_unlock(&ctx->lock);
3009 }
3010
3011 /*
3012  * Round-robin a context's events:
3013  */
3014 static void rotate_ctx(struct perf_event_context *ctx)
3015 {
3016         /*
3017          * Rotate the first entry last of non-pinned groups. Rotation might be
3018          * disabled by the inheritance code.
3019          */
3020         if (!ctx->rotate_disable)
3021                 list_rotate_left(&ctx->flexible_groups);
3022 }
3023
3024 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3025 {
3026         struct perf_event_context *ctx = NULL;
3027         int rotate = 0;
3028
3029         if (cpuctx->ctx.nr_events) {
3030                 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3031                         rotate = 1;
3032         }
3033
3034         ctx = cpuctx->task_ctx;
3035         if (ctx && ctx->nr_events) {
3036                 if (ctx->nr_events != ctx->nr_active)
3037                         rotate = 1;
3038         }
3039
3040         if (!rotate)
3041                 goto done;
3042
3043         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3044         perf_pmu_disable(cpuctx->ctx.pmu);
3045
3046         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3047         if (ctx)
3048                 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3049
3050         rotate_ctx(&cpuctx->ctx);
3051         if (ctx)
3052                 rotate_ctx(ctx);
3053
3054         perf_event_sched_in(cpuctx, ctx, current);
3055
3056         perf_pmu_enable(cpuctx->ctx.pmu);
3057         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3058 done:
3059
3060         return rotate;
3061 }
3062
#ifdef CONFIG_NO_HZ_FULL
/*
 * Returns false while any frequency event exists or while this cpu has
 * a non-zero throttled count; true otherwise.
 */
bool perf_event_can_stop_tick(void)
{
	return !(atomic_read(&nr_freq_events) ||
		 __this_cpu_read(perf_throttled_count));
}
#endif
3073
3074 void perf_event_task_tick(void)
3075 {
3076         struct list_head *head = this_cpu_ptr(&active_ctx_list);
3077         struct perf_event_context *ctx, *tmp;
3078         int throttled;
3079
3080         WARN_ON(!irqs_disabled());
3081
3082         __this_cpu_inc(perf_throttled_seq);
3083         throttled = __this_cpu_xchg(perf_throttled_count, 0);
3084
3085         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3086                 perf_adjust_freq_unthr_context(ctx, throttled);
3087 }
3088
3089 static int event_enable_on_exec(struct perf_event *event,
3090                                 struct perf_event_context *ctx)
3091 {
3092         if (!event->attr.enable_on_exec)
3093                 return 0;
3094
3095         event->attr.enable_on_exec = 0;
3096         if (event->state >= PERF_EVENT_STATE_INACTIVE)
3097                 return 0;
3098
3099         __perf_event_mark_enabled(event);
3100
3101         return 1;
3102 }
3103
3104 /*
3105  * Enable all of a task's events that have been marked enable-on-exec.
3106  * This expects task == current.
3107  */
3108 static void perf_event_enable_on_exec(struct perf_event_context *ctx)
3109 {
3110         struct perf_event_context *clone_ctx = NULL;
3111         struct perf_event *event;
3112         unsigned long flags;
3113         int enabled = 0;
3114         int ret;
3115
3116         local_irq_save(flags);
3117         if (!ctx || !ctx->nr_events)
3118                 goto out;
3119
3120         /*
3121          * We must ctxsw out cgroup events to avoid conflict
3122          * when invoking perf_task_event_sched_in() later on
3123          * in this function. Otherwise we end up trying to
3124          * ctxswin cgroup events which are already scheduled
3125          * in.
3126          */
3127         perf_cgroup_sched_out(current, NULL);
3128
3129         raw_spin_lock(&ctx->lock);
3130         task_ctx_sched_out(ctx);
3131
3132         list_for_each_entry(event, &ctx->event_list, event_entry) {
3133                 ret = event_enable_on_exec(event, ctx);
3134                 if (ret)
3135                         enabled = 1;
3136         }
3137
3138         /*
3139          * Unclone this context if we enabled any event.
3140          */
3141         if (enabled)
3142                 clone_ctx = unclone_ctx(ctx);
3143
3144         raw_spin_unlock(&ctx->lock);
3145
3146         /*
3147          * Also calls ctxswin for cgroup events, if any:
3148          */
3149         perf_event_context_sched_in(ctx, ctx->task);
3150 out:
3151         local_irq_restore(flags);
3152
3153         if (clone_ctx)
3154                 put_ctx(clone_ctx);
3155 }
3156
3157 void perf_event_exec(void)
3158 {
3159         struct perf_event_context *ctx;
3160         int ctxn;
3161
3162         rcu_read_lock();
3163         for_each_task_context_nr(ctxn) {
3164                 ctx = current->perf_event_ctxp[ctxn];
3165                 if (!ctx)
3166                         continue;
3167
3168                 perf_event_enable_on_exec(ctx);
3169         }
3170         rcu_read_unlock();
3171 }
3172
3173 /*
3174  * Cross CPU call to read the hardware event
3175  */
3176 static void __perf_event_read(void *info)
3177 {
3178         struct perf_event *event = info;
3179         struct perf_event_context *ctx = event->ctx;
3180         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3181
3182         /*
3183          * If this is a task context, we need to check whether it is
3184          * the current task context of this cpu.  If not it has been
3185          * scheduled out before the smp call arrived.  In that case
3186          * event->count would have been updated to a recent sample
3187          * when the event was scheduled out.
3188          */
3189         if (ctx->task && cpuctx->task_ctx != ctx)
3190                 return;
3191
3192         raw_spin_lock(&ctx->lock);
3193         if (ctx->is_active) {
3194                 update_context_time(ctx);
3195                 update_cgrp_time_from_event(event);
3196         }
3197         update_event_times(event);
3198         if (event->state == PERF_EVENT_STATE_ACTIVE)
3199                 event->pmu->read(event);
3200         raw_spin_unlock(&ctx->lock);
3201 }
3202
3203 static inline u64 perf_event_count(struct perf_event *event)
3204 {
3205         if (event->pmu->count)
3206                 return event->pmu->count(event);
3207
3208         return __perf_event_count(event);
3209 }
3210
3211 static u64 perf_event_read(struct perf_event *event)
3212 {
3213         /*
3214          * If event is enabled and currently active on a CPU, update the
3215          * value in the event structure:
3216          */
3217         if (event->state == PERF_EVENT_STATE_ACTIVE) {
3218                 smp_call_function_single(event->oncpu,
3219                                          __perf_event_read, event, 1);
3220         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3221                 struct perf_event_context *ctx = event->ctx;
3222                 unsigned long flags;
3223
3224                 raw_spin_lock_irqsave(&ctx->lock, flags);
3225                 /*
3226                  * may read while context is not active
3227                  * (e.g., thread is blocked), in that case
3228                  * we cannot update context time
3229                  */
3230                 if (ctx->is_active) {
3231                         update_context_time(ctx);
3232                         update_cgrp_time_from_event(event);
3233                 }
3234                 update_event_times(event);
3235                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3236         }
3237
3238         return perf_event_count(event);
3239 }
3240
3241 /*
3242  * Initialize the perf_event context in a task_struct:
3243  */
3244 static void __perf_event_init_context(struct perf_event_context *ctx)
3245 {
3246         raw_spin_lock_init(&ctx->lock);
3247         mutex_init(&ctx->mutex);
3248         INIT_LIST_HEAD(&ctx->active_ctx_list);
3249         INIT_LIST_HEAD(&ctx->pinned_groups);
3250         INIT_LIST_HEAD(&ctx->flexible_groups);
3251         INIT_LIST_HEAD(&ctx->event_list);
3252         atomic_set(&ctx->refcount, 1);
3253         INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3254 }
3255
3256 static struct perf_event_context *
3257 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3258 {
3259         struct perf_event_context *ctx;
3260
3261         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3262         if (!ctx)
3263                 return NULL;
3264
3265         __perf_event_init_context(ctx);
3266         if (task) {
3267                 ctx->task = task;
3268                 get_task_struct(task);
3269         }
3270         ctx->pmu = pmu;
3271
3272         return ctx;
3273 }
3274
3275 static struct task_struct *
3276 find_lively_task_by_vpid(pid_t vpid)
3277 {
3278         struct task_struct *task;
3279         int err;
3280
3281         rcu_read_lock();
3282         if (!vpid)
3283                 task = current;
3284         else
3285                 task = find_task_by_vpid(vpid);
3286         if (task)
3287                 get_task_struct(task);
3288         rcu_read_unlock();
3289
3290         if (!task)
3291                 return ERR_PTR(-ESRCH);
3292
3293         /* Reuse ptrace permission checks for now. */
3294         err = -EACCES;
3295         if (!ptrace_may_access(task, PTRACE_MODE_READ))
3296                 goto errout;
3297
3298         return task;
3299 errout:
3300         put_task_struct(task);
3301         return ERR_PTR(err);
3302
3303 }
3304
3305 /*
3306  * Returns a matching context with refcount and pincount.
3307  */
3308 static struct perf_event_context *
3309 find_get_context(struct pmu *pmu, struct task_struct *task,
3310                 struct perf_event *event)
3311 {
3312         struct perf_event_context *ctx, *clone_ctx = NULL;
3313         struct perf_cpu_context *cpuctx;
3314         void *task_ctx_data = NULL;
3315         unsigned long flags;
3316         int ctxn, err;
3317         int cpu = event->cpu;
3318
3319         if (!task) {
3320                 /* Must be root to operate on a CPU event: */
3321                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3322                         return ERR_PTR(-EACCES);
3323
3324                 /*
3325                  * We could be clever and allow to attach a event to an
3326                  * offline CPU and activate it when the CPU comes up, but
3327                  * that's for later.
3328                  */
3329                 if (!cpu_online(cpu))
3330                         return ERR_PTR(-ENODEV);
3331
3332                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3333                 ctx = &cpuctx->ctx;
3334                 get_ctx(ctx);
3335                 ++ctx->pin_count;
3336
3337                 return ctx;
3338         }
3339
3340         err = -EINVAL;
3341         ctxn = pmu->task_ctx_nr;
3342         if (ctxn < 0)
3343                 goto errout;
3344
3345         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3346                 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3347                 if (!task_ctx_data) {
3348                         err = -ENOMEM;
3349                         goto errout;
3350                 }
3351         }
3352
3353 retry:
3354         ctx = perf_lock_task_context(task, ctxn, &flags);
3355         if (ctx) {
3356                 clone_ctx = unclone_ctx(ctx);
3357                 ++ctx->pin_count;
3358
3359                 if (task_ctx_data && !ctx->task_ctx_data) {
3360                         ctx->task_ctx_data = task_ctx_data;
3361                         task_ctx_data = NULL;
3362                 }
3363                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3364
3365                 if (clone_ctx)
3366                         put_ctx(clone_ctx);
3367         } else {
3368                 ctx = alloc_perf_context(pmu, task);
3369                 err = -ENOMEM;
3370                 if (!ctx)
3371                         goto errout;
3372
3373                 if (task_ctx_data) {
3374                         ctx->task_ctx_data = task_ctx_data;
3375                         task_ctx_data = NULL;
3376                 }
3377
3378                 err = 0;
3379                 mutex_lock(&task->perf_event_mutex);
3380                 /*
3381                  * If it has already passed perf_event_exit_task().
3382                  * we must see PF_EXITING, it takes this mutex too.
3383                  */
3384                 if (task->flags & PF_EXITING)
3385                         err = -ESRCH;
3386                 else if (task->perf_event_ctxp[ctxn])
3387                         err = -EAGAIN;
3388                 else {
3389                         get_ctx(ctx);
3390                         ++ctx->pin_count;
3391                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3392                 }
3393                 mutex_unlock(&task->perf_event_mutex);
3394
3395                 if (unlikely(err)) {
3396                         put_ctx(ctx);
3397
3398                         if (err == -EAGAIN)
3399                                 goto retry;
3400                         goto errout;
3401                 }
3402         }
3403
3404         kfree(task_ctx_data);
3405         return ctx;
3406
3407 errout:
3408         kfree(task_ctx_data);
3409         return ERR_PTR(err);
3410 }
3411
3412 static void perf_event_free_filter(struct perf_event *event);
3413 static void perf_event_free_bpf_prog(struct perf_event *event);
3414
3415 static void free_event_rcu(struct rcu_head *head)
3416 {
3417         struct perf_event *event;
3418
3419         event = container_of(head, struct perf_event, rcu_head);
3420         if (event->ns)
3421                 put_pid_ns(event->ns);
3422         perf_event_free_filter(event);
3423         perf_event_free_bpf_prog(event);
3424         kfree(event);
3425 }
3426
3427 static void ring_buffer_attach(struct perf_event *event,
3428                                struct ring_buffer *rb);
3429
3430 static void unaccount_event_cpu(struct perf_event *event, int cpu)
3431 {
3432         if (event->parent)
3433                 return;
3434
3435         if (is_cgroup_event(event))
3436                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3437 }
3438
3439 static void unaccount_event(struct perf_event *event)
3440 {
3441         if (event->parent)
3442                 return;
3443
3444         if (event->attach_state & PERF_ATTACH_TASK)
3445                 static_key_slow_dec_deferred(&perf_sched_events);
3446         if (event->attr.mmap || event->attr.mmap_data)
3447                 atomic_dec(&nr_mmap_events);
3448         if (event->attr.comm)
3449                 atomic_dec(&nr_comm_events);
3450         if (event->attr.task)
3451                 atomic_dec(&nr_task_events);
3452         if (event->attr.freq)
3453                 atomic_dec(&nr_freq_events);
3454         if (is_cgroup_event(event))
3455                 static_key_slow_dec_deferred(&perf_sched_events);
3456         if (has_branch_stack(event))
3457                 static_key_slow_dec_deferred(&perf_sched_events);
3458
3459         unaccount_event_cpu(event, event->cpu);
3460 }
3461
3462 /*
3463  * The following implement mutual exclusion of events on "exclusive" pmus
3464  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3465  * at a time, so we disallow creating events that might conflict, namely:
3466  *
3467  *  1) cpu-wide events in the presence of per-task events,
3468  *  2) per-task events in the presence of cpu-wide events,
3469  *  3) two matching events on the same context.
3470  *
3471  * The former two cases are handled in the allocation path (perf_event_alloc(),
3472  * __free_event()), the latter -- before the first perf_install_in_context().
3473  */
3474 static int exclusive_event_init(struct perf_event *event)
3475 {
3476         struct pmu *pmu = event->pmu;
3477
3478         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3479                 return 0;
3480
3481         /*
3482          * Prevent co-existence of per-task and cpu-wide events on the
3483          * same exclusive pmu.
3484          *
3485          * Negative pmu::exclusive_cnt means there are cpu-wide
3486          * events on this "exclusive" pmu, positive means there are
3487          * per-task events.
3488          *
3489          * Since this is called in perf_event_alloc() path, event::ctx
3490          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3491          * to mean "per-task event", because unlike other attach states it
3492          * never gets cleared.
3493          */
3494         if (event->attach_state & PERF_ATTACH_TASK) {
3495                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3496                         return -EBUSY;
3497         } else {
3498                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3499                         return -EBUSY;
3500         }
3501
3502         return 0;
3503 }
3504
3505 static void exclusive_event_destroy(struct perf_event *event)
3506 {
3507         struct pmu *pmu = event->pmu;
3508
3509         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3510                 return;
3511
3512         /* see comment in exclusive_event_init() */
3513         if (event->attach_state & PERF_ATTACH_TASK)
3514                 atomic_dec(&pmu->exclusive_cnt);
3515         else
3516                 atomic_inc(&pmu->exclusive_cnt);
3517 }
3518
3519 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3520 {
3521         if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3522             (e1->cpu == e2->cpu ||
3523              e1->cpu == -1 ||
3524              e2->cpu == -1))
3525                 return true;
3526         return false;
3527 }
3528
3529 /* Called under the same ctx::mutex as perf_install_in_context() */
3530 static bool exclusive_event_installable(struct perf_event *event,
3531                                         struct perf_event_context *ctx)
3532 {
3533         struct perf_event *iter_event;
3534         struct pmu *pmu = event->pmu;
3535
3536         if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3537                 return true;
3538
3539         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3540                 if (exclusive_event_match(iter_event, event))
3541                         return false;
3542         }
3543
3544         return true;
3545 }
3546
3547 static void __free_event(struct perf_event *event)
3548 {
3549         if (!event->parent) {
3550                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3551                         put_callchain_buffers();
3552         }
3553
3554         if (event->destroy)
3555                 event->destroy(event);
3556
3557         if (event->ctx)
3558                 put_ctx(event->ctx);
3559
3560         if (event->pmu) {
3561                 exclusive_event_destroy(event);
3562                 module_put(event->pmu->module);
3563         }
3564
3565         call_rcu(&event->rcu_head, free_event_rcu);
3566 }
3567
3568 static void _free_event(struct perf_event *event)
3569 {
3570         irq_work_sync(&event->pending);
3571
3572         unaccount_event(event);
3573
3574         if (event->rb) {
3575                 /*
3576                  * Can happen when we close an event with re-directed output.
3577                  *
3578                  * Since we have a 0 refcount, perf_mmap_close() will skip
3579                  * over us; possibly making our ring_buffer_put() the last.
3580                  */
3581                 mutex_lock(&event->mmap_mutex);
3582                 ring_buffer_attach(event, NULL);
3583                 mutex_unlock(&event->mmap_mutex);
3584         }
3585
3586         if (is_cgroup_event(event))
3587                 perf_detach_cgroup(event);
3588
3589         __free_event(event);
3590 }
3591
3592 /*
3593  * Used to free events which have a known refcount of 1, such as in error paths
3594  * where the event isn't exposed yet and inherited events.
3595  */
3596 static void free_event(struct perf_event *event)
3597 {
3598         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3599                                 "unexpected event refcount: %ld; ptr=%p\n",
3600                                 atomic_long_read(&event->refcount), event)) {
3601                 /* leak to avoid use-after-free */
3602                 return;
3603         }
3604
3605         _free_event(event);
3606 }
3607
3608 /*
3609  * Remove user event from the owner task.
3610  */
3611 static void perf_remove_from_owner(struct perf_event *event)
3612 {
3613         struct task_struct *owner;
3614
3615         rcu_read_lock();
3616         owner = ACCESS_ONCE(event->owner);
3617         /*
3618          * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3619          * !owner it means the list deletion is complete and we can indeed
3620          * free this event, otherwise we need to serialize on
3621          * owner->perf_event_mutex.
3622          */
3623         smp_read_barrier_depends();
3624         if (owner) {
3625                 /*
3626                  * Since delayed_put_task_struct() also drops the last
3627                  * task reference we can safely take a new reference
3628                  * while holding the rcu_read_lock().
3629                  */
3630                 get_task_struct(owner);
3631         }
3632         rcu_read_unlock();
3633
3634         if (owner) {
3635                 /*
3636                  * If we're here through perf_event_exit_task() we're already
3637                  * holding ctx->mutex which would be an inversion wrt. the
3638                  * normal lock order.
3639                  *
3640                  * However we can safely take this lock because its the child
3641                  * ctx->mutex.
3642                  */
3643                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3644
3645                 /*
3646                  * We have to re-check the event->owner field, if it is cleared
3647                  * we raced with perf_event_exit_task(), acquiring the mutex
3648                  * ensured they're done, and we can proceed with freeing the
3649                  * event.
3650                  */
3651                 if (event->owner)
3652                         list_del_init(&event->owner_entry);
3653                 mutex_unlock(&owner->perf_event_mutex);
3654                 put_task_struct(owner);
3655         }
3656 }
3657
3658 /*
3659  * Called when the last reference to the file is gone.
3660  */
3661 static void put_event(struct perf_event *event)
3662 {
3663         struct perf_event_context *ctx;
3664
3665         if (!atomic_long_dec_and_test(&event->refcount))
3666                 return;
3667
3668         if (!is_kernel_event(event))
3669                 perf_remove_from_owner(event);
3670
3671         /*
3672          * There are two ways this annotation is useful:
3673          *
3674          *  1) there is a lock recursion from perf_event_exit_task
3675          *     see the comment there.
3676          *
3677          *  2) there is a lock-inversion with mmap_sem through
3678          *     perf_event_read_group(), which takes faults while
3679          *     holding ctx->mutex, however this is called after
3680          *     the last filedesc died, so there is no possibility
3681          *     to trigger the AB-BA case.
3682          */
3683         ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
3684         WARN_ON_ONCE(ctx->parent_ctx);
3685         perf_remove_from_context(event, true);
3686         perf_event_ctx_unlock(event, ctx);
3687
3688         _free_event(event);
3689 }
3690
/*
 * Exported release entry point: drops one reference on @event via
 * put_event(). Always returns 0.
 */
int perf_event_release_kernel(struct perf_event *event)
{
	put_event(event);

	return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3697
3698 static int perf_release(struct inode *inode, struct file *file)
3699 {
3700         put_event(file->private_data);
3701         return 0;
3702 }
3703
3704 /*
3705  * Remove all orphanes events from the context.
3706  */
3707 static void orphans_remove_work(struct work_struct *work)
3708 {
3709         struct perf_event_context *ctx;
3710         struct perf_event *event, *tmp;
3711
3712         ctx = container_of(work, struct perf_event_context,
3713                            orphans_remove.work);
3714
3715         mutex_lock(&ctx->mutex);
3716         list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3717                 struct perf_event *parent_event = event->parent;
3718
3719                 if (!is_orphaned_child(event))
3720                         continue;
3721
3722                 perf_remove_from_context(event, true);
3723
3724                 mutex_lock(&parent_event->child_mutex);
3725                 list_del_init(&event->child_list);
3726                 mutex_unlock(&parent_event->child_mutex);
3727
3728                 free_event(event);
3729                 put_event(parent_event);
3730         }
3731
3732         raw_spin_lock_irq(&ctx->lock);
3733         ctx->orphans_remove_sched = false;
3734         raw_spin_unlock_irq(&ctx->lock);
3735         mutex_unlock(&ctx->mutex);
3736
3737         put_ctx(ctx);
3738 }
3739
3740 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3741 {
3742         struct perf_event *child;
3743         u64 total = 0;
3744
3745         *enabled = 0;
3746         *running = 0;
3747
3748         mutex_lock(&event->child_mutex);
3749         total += perf_event_read(event);
3750         *enabled += event->total_time_enabled +
3751                         atomic64_read(&event->child_total_time_enabled);
3752         *running += event->total_time_running +
3753                         atomic64_read(&event->child_total_time_running);
3754
3755         list_for_each_entry(child, &event->child_list, child_list) {
3756                 total += perf_event_read(child);
3757                 *enabled += child->total_time_enabled;
3758                 *running += child->total_time_running;
3759         }
3760         mutex_unlock(&event->child_mutex);
3761
3762         return total;
3763 }
3764 EXPORT_SYMBOL_GPL(perf_event_read_value);
3765
3766 static int perf_event_read_group(struct perf_event *event,
3767                                    u64 read_format, char __user *buf)
3768 {
3769         struct perf_event *leader = event->group_leader, *sub;
3770         struct perf_event_context *ctx = leader->ctx;
3771         int n = 0, size = 0, ret;
3772         u64 count, enabled, running;
3773         u64 values[5];
3774
3775         lockdep_assert_held(&ctx->mutex);
3776
3777         count = perf_event_read_value(leader, &enabled, &running);
3778
3779         values[n++] = 1 + leader->nr_siblings;
3780         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3781                 values[n++] = enabled;
3782         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3783                 values[n++] = running;
3784         values[n++] = count;
3785         if (read_format & PERF_FORMAT_ID)
3786                 values[n++] = primary_event_id(leader);
3787
3788         size = n * sizeof(u64);
3789
3790         if (copy_to_user(buf, values, size))
3791                 return -EFAULT;
3792
3793         ret = size;
3794
3795         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3796                 n = 0;
3797
3798                 values[n++] = perf_event_read_value(sub, &enabled, &running);
3799                 if (read_format & PERF_FORMAT_ID)
3800                         values[n++] = primary_event_id(sub);
3801
3802                 size = n * sizeof(u64);
3803
3804                 if (copy_to_user(buf + ret, values, size)) {
3805                         return -EFAULT;
3806                 }
3807
3808                 ret += size;
3809         }
3810
3811         return ret;
3812 }
3813
3814 static int perf_event_read_one(struct perf_event *event,
3815                                  u64 read_format, char __user *buf)
3816 {
3817         u64 enabled, running;
3818         u64 values[4];
3819         int n = 0;
3820
3821         values[n++] = perf_event_read_value(event, &enabled, &running);
3822         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3823                 values[n++] = enabled;
3824         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3825                 values[n++] = running;
3826         if (read_format & PERF_FORMAT_ID)
3827                 values[n++] = primary_event_id(event);
3828
3829         if (copy_to_user(buf, values, n * sizeof(u64)))
3830                 return -EFAULT;
3831
3832         return n * sizeof(u64);
3833 }
3834
3835 static bool is_event_hup(struct perf_event *event)
3836 {
3837         bool no_children;
3838
3839         if (event->state != PERF_EVENT_STATE_EXIT)
3840                 return false;
3841
3842         mutex_lock(&event->child_mutex);
3843         no_children = list_empty(&event->child_list);
3844         mutex_unlock(&event->child_mutex);
3845         return no_children;
3846 }
3847
3848 /*
3849  * Read the performance event - simple non blocking version for now
3850  */
3851 static ssize_t
3852 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3853 {
3854         u64 read_format = event->attr.read_format;
3855         int ret;
3856
3857         /*
3858          * Return end-of-file for a read on a event that is in
3859          * error state (i.e. because it was pinned but it couldn't be
3860          * scheduled on to the CPU at some point).
3861          */
3862         if (event->state == PERF_EVENT_STATE_ERROR)
3863                 return 0;
3864
3865         if (count < event->read_size)
3866                 return -ENOSPC;
3867
3868         WARN_ON_ONCE(event->ctx->parent_ctx);
3869         if (read_format & PERF_FORMAT_GROUP)
3870                 ret = perf_event_read_group(event, read_format, buf);
3871         else
3872                 ret = perf_event_read_one(event, read_format, buf);
3873
3874         return ret;
3875 }
3876
3877 static ssize_t
3878 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3879 {
3880         struct perf_event *event = file->private_data;
3881         struct perf_event_context *ctx;
3882         int ret;
3883
3884         ctx = perf_event_ctx_lock(event);
3885         ret = perf_read_hw(event, buf, count);
3886         perf_event_ctx_unlock(event, ctx);
3887
3888         return ret;
3889 }
3890
3891 static unsigned int perf_poll(struct file *file, poll_table *wait)
3892 {
3893         struct perf_event *event = file->private_data;
3894         struct ring_buffer *rb;
3895         unsigned int events = POLLHUP;
3896
3897         poll_wait(file, &event->waitq, wait);
3898
3899         if (is_event_hup(event))
3900                 return events;
3901
3902         /*
3903          * Pin the event->rb by taking event->mmap_mutex; otherwise
3904          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3905          */
3906         mutex_lock(&event->mmap_mutex);
3907         rb = event->rb;
3908         if (rb)
3909                 events = atomic_xchg(&rb->poll, 0);
3910         mutex_unlock(&event->mmap_mutex);
3911         return events;
3912 }
3913
3914 static void _perf_event_reset(struct perf_event *event)
3915 {
3916         (void)perf_event_read(event);
3917         local64_set(&event->count, 0);
3918         perf_event_update_userpage(event);
3919 }
3920
3921 /*
3922  * Holding the top-level event's child_mutex means that any
3923  * descendant process that has inherited this event will block
3924  * in sync_child_event if it goes to exit, thus satisfying the
3925  * task existence requirements of perf_event_enable/disable.
3926  */
3927 static void perf_event_for_each_child(struct perf_event *event,
3928                                         void (*func)(struct perf_event *))
3929 {
3930         struct perf_event *child;
3931
3932         WARN_ON_ONCE(event->ctx->parent_ctx);
3933
3934         mutex_lock(&event->child_mutex);
3935         func(event);
3936         list_for_each_entry(child, &event->child_list, child_list)
3937                 func(child);
3938         mutex_unlock(&event->child_mutex);
3939 }
3940
3941 static void perf_event_for_each(struct perf_event *event,
3942                                   void (*func)(struct perf_event *))
3943 {
3944         struct perf_event_context *ctx = event->ctx;
3945         struct perf_event *sibling;
3946
3947         lockdep_assert_held(&ctx->mutex);
3948
3949         event = event->group_leader;
3950
3951         perf_event_for_each_child(event, func);
3952         list_for_each_entry(sibling, &event->sibling_list, group_entry)
3953                 perf_event_for_each_child(sibling, func);
3954 }
3955
3956 static int perf_event_period(struct perf_event *event, u64 __user *arg)
3957 {
3958         struct perf_event_context *ctx = event->ctx;
3959         int ret = 0, active;
3960         u64 value;
3961
3962         if (!is_sampling_event(event))
3963                 return -EINVAL;
3964
3965         if (copy_from_user(&value, arg, sizeof(value)))
3966                 return -EFAULT;
3967
3968         if (!value)
3969                 return -EINVAL;
3970
3971         raw_spin_lock_irq(&ctx->lock);
3972         if (event->attr.freq) {
3973                 if (value > sysctl_perf_event_sample_rate) {
3974                         ret = -EINVAL;
3975                         goto unlock;
3976                 }
3977
3978                 event->attr.sample_freq = value;
3979         } else {
3980                 event->attr.sample_period = value;
3981                 event->hw.sample_period = value;
3982         }
3983
3984         active = (event->state == PERF_EVENT_STATE_ACTIVE);
3985         if (active) {
3986                 perf_pmu_disable(ctx->pmu);
3987                 event->pmu->stop(event, PERF_EF_UPDATE);
3988         }
3989
3990         local64_set(&event->hw.period_left, 0);
3991
3992         if (active) {
3993                 event->pmu->start(event, PERF_EF_RELOAD);
3994                 perf_pmu_enable(ctx->pmu);
3995         }
3996
3997 unlock:
3998         raw_spin_unlock_irq(&ctx->lock);
3999
4000         return ret;
4001 }
4002
4003 static const struct file_operations perf_fops;
4004
4005 static inline int perf_fget_light(int fd, struct fd *p)
4006 {
4007         struct fd f = fdget(fd);
4008         if (!f.file)
4009                 return -EBADF;
4010
4011         if (f.file->f_op != &perf_fops) {
4012                 fdput(f);
4013                 return -EBADF;
4014         }
4015         *p = f;
4016         return 0;
4017 }
4018
4019 static int perf_event_set_output(struct perf_event *event,
4020                                  struct perf_event *output_event);
4021 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4022 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4023
4024 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4025 {
4026         void (*func)(struct perf_event *);
4027         u32 flags = arg;
4028
4029         switch (cmd) {
4030         case PERF_EVENT_IOC_ENABLE:
4031                 func = _perf_event_enable;
4032                 break;
4033         case PERF_EVENT_IOC_DISABLE:
4034                 func = _perf_event_disable;
4035                 break;
4036         case PERF_EVENT_IOC_RESET:
4037                 func = _perf_event_reset;
4038                 break;
4039
4040         case PERF_EVENT_IOC_REFRESH:
4041                 return _perf_event_refresh(event, arg);
4042
4043         case PERF_EVENT_IOC_PERIOD:
4044                 return perf_event_period(event, (u64 __user *)arg);
4045
4046         case PERF_EVENT_IOC_ID:
4047         {
4048                 u64 id = primary_event_id(event);
4049
4050                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4051                         return -EFAULT;
4052                 return 0;
4053         }
4054
4055         case PERF_EVENT_IOC_SET_OUTPUT:
4056         {
4057                 int ret;
4058                 if (arg != -1) {
4059                         struct perf_event *output_event;
4060                         struct fd output;
4061                         ret = perf_fget_light(arg, &output);
4062                         if (ret)
4063                                 return ret;
4064                         output_event = output.file->private_data;
4065                         ret = perf_event_set_output(event, output_event);
4066                         fdput(output);
4067                 } else {
4068                         ret = perf_event_set_output(event, NULL);
4069                 }
4070                 return ret;
4071         }
4072
4073         case PERF_EVENT_IOC_SET_FILTER:
4074                 return perf_event_set_filter(event, (void __user *)arg);
4075
4076         case PERF_EVENT_IOC_SET_BPF:
4077                 return perf_event_set_bpf_prog(event, arg);
4078
4079         default:
4080                 return -ENOTTY;
4081         }
4082
4083         if (flags & PERF_IOC_FLAG_GROUP)
4084                 perf_event_for_each(event, func);
4085         else
4086                 perf_event_for_each_child(event, func);
4087
4088         return 0;
4089 }
4090
4091 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4092 {
4093         struct perf_event *event = file->private_data;
4094         struct perf_event_context *ctx;
4095         long ret;
4096
4097         ctx = perf_event_ctx_lock(event);
4098         ret = _perf_ioctl(event, cmd, arg);
4099         perf_event_ctx_unlock(event, ctx);
4100
4101         return ret;
4102 }
4103
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl() handler. For SET_FILTER and ID the size encoded in @cmd is
 * rewritten from the compat pointer size to the native pointer size before
 * the command is forwarded to perf_ioctl().
 */
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
				unsigned long arg)
{
	switch (_IOC_NR(cmd)) {
	case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
	case _IOC_NR(PERF_EVENT_IOC_ID):
		/* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case). */
		if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
			cmd &= ~IOCSIZE_MASK;
			cmd |= sizeof(void *) << IOCSIZE_SHIFT;
		}
		break;
	}

	return perf_ioctl(file, cmd, arg);
}
#else
# define perf_compat_ioctl NULL
#endif
4123
4124 int perf_event_task_enable(void)
4125 {
4126         struct perf_event_context *ctx;
4127         struct perf_event *event;
4128
4129         mutex_lock(&current->perf_event_mutex);
4130         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4131                 ctx = perf_event_ctx_lock(event);
4132                 perf_event_for_each_child(event, _perf_event_enable);
4133                 perf_event_ctx_unlock(event, ctx);
4134         }
4135         mutex_unlock(&current->perf_event_mutex);
4136
4137         return 0;
4138 }
4139
4140 int perf_event_task_disable(void)
4141 {
4142         struct perf_event_context *ctx;
4143         struct perf_event *event;
4144
4145         mutex_lock(&current->perf_event_mutex);
4146         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4147                 ctx = perf_event_ctx_lock(event);
4148                 perf_event_for_each_child(event, _perf_event_disable);
4149                 perf_event_ctx_unlock(event, ctx);
4150         }
4151         mutex_unlock(&current->perf_event_mutex);
4152
4153         return 0;
4154 }
4155
4156 static int perf_event_index(struct perf_event *event)
4157 {
4158         if (event->hw.state & PERF_HES_STOPPED)
4159                 return 0;
4160
4161         if (event->state != PERF_EVENT_STATE_ACTIVE)
4162                 return 0;
4163
4164         return event->pmu->event_idx(event);
4165 }
4166
4167 static void calc_timer_values(struct perf_event *event,
4168                                 u64 *now,
4169                                 u64 *enabled,
4170                                 u64 *running)
4171 {
4172         u64 ctx_time;
4173
4174         *now = perf_clock();
4175         ctx_time = event->shadow_ctx_time + *now;
4176         *enabled = ctx_time - event->tstamp_enabled;
4177         *running = ctx_time - event->tstamp_running;
4178 }
4179
4180 static void perf_event_init_userpage(struct perf_event *event)
4181 {
4182         struct perf_event_mmap_page *userpg;
4183         struct ring_buffer *rb;
4184
4185         rcu_read_lock();
4186         rb = rcu_dereference(event->rb);
4187         if (!rb)
4188                 goto unlock;
4189
4190         userpg = rb->user_page;
4191
4192         /* Allow new userspace to detect that bit 0 is deprecated */
4193         userpg->cap_bit0_is_deprecated = 1;
4194         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4195         userpg->data_offset = PAGE_SIZE;
4196         userpg->data_size = perf_data_size(rb);
4197
4198 unlock:
4199         rcu_read_unlock();
4200 }
4201
4202 void __weak arch_perf_update_userpage(
4203         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4204 {
4205 }
4206
4207 /*
4208  * Callers need to ensure there can be no nesting of this function, otherwise
4209  * the seqlock logic goes bad. We can not serialize this because the arch
4210  * code calls this from NMI context.
4211  */
4212 void perf_event_update_userpage(struct perf_event *event)
4213 {
4214         struct perf_event_mmap_page *userpg;
4215         struct ring_buffer *rb;
4216         u64 enabled, running, now;
4217
4218         rcu_read_lock();
4219         rb = rcu_dereference(event->rb);
4220         if (!rb)
4221                 goto unlock;
4222
4223         /*
4224          * compute total_time_enabled, total_time_running
4225          * based on snapshot values taken when the event
4226          * was last scheduled in.
4227          *
4228          * we cannot simply called update_context_time()
4229          * because of locking issue as we can be called in
4230          * NMI context
4231          */
4232         calc_timer_values(event, &now, &enabled, &running);
4233
4234         userpg = rb->user_page;
4235         /*
4236          * Disable preemption so as to not let the corresponding user-space
4237          * spin too long if we get preempted.
4238          */
4239         preempt_disable();
4240         ++userpg->lock;
4241         barrier();
4242         userpg->index = perf_event_index(event);
4243         userpg->offset = perf_event_count(event);
4244         if (userpg->index)
4245                 userpg->offset -= local64_read(&event->hw.prev_count);
4246
4247         userpg->time_enabled = enabled +
4248                         atomic64_read(&event->child_total_time_enabled);
4249
4250         userpg->time_running = running +
4251                         atomic64_read(&event->child_total_time_running);
4252
4253         arch_perf_update_userpage(event, userpg, now);
4254
4255         barrier();
4256         ++userpg->lock;
4257         preempt_enable();
4258 unlock:
4259         rcu_read_unlock();
4260 }
4261
4262 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4263 {
4264         struct perf_event *event = vma->vm_file->private_data;
4265         struct ring_buffer *rb;
4266         int ret = VM_FAULT_SIGBUS;
4267
4268         if (vmf->flags & FAULT_FLAG_MKWRITE) {
4269                 if (vmf->pgoff == 0)
4270                         ret = 0;
4271                 return ret;
4272         }
4273
4274         rcu_read_lock();
4275         rb = rcu_dereference(event->rb);
4276         if (!rb)
4277                 goto unlock;
4278
4279         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4280                 goto unlock;
4281
4282         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4283         if (!vmf->page)
4284                 goto unlock;
4285
4286         get_page(vmf->page);
4287         vmf->page->mapping = vma->vm_file->f_mapping;
4288         vmf->page->index   = vmf->pgoff;
4289
4290         ret = 0;
4291 unlock:
4292         rcu_read_unlock();
4293
4294         return ret;
4295 }
4296
4297 static void ring_buffer_attach(struct perf_event *event,
4298                                struct ring_buffer *rb)
4299 {
4300         struct ring_buffer *old_rb = NULL;
4301         unsigned long flags;
4302
4303         if (event->rb) {
4304                 /*
4305                  * Should be impossible, we set this when removing
4306                  * event->rb_entry and wait/clear when adding event->rb_entry.
4307                  */
4308                 WARN_ON_ONCE(event->rcu_pending);
4309
4310                 old_rb = event->rb;
4311                 event->rcu_batches = get_state_synchronize_rcu();
4312                 event->rcu_pending = 1;
4313
4314                 spin_lock_irqsave(&old_rb->event_lock, flags);
4315                 list_del_rcu(&event->rb_entry);
4316                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4317         }
4318
4319         if (event->rcu_pending && rb) {
4320                 cond_synchronize_rcu(event->rcu_batches);
4321                 event->rcu_pending = 0;
4322         }
4323
4324         if (rb) {
4325                 spin_lock_irqsave(&rb->event_lock, flags);
4326                 list_add_rcu(&event->rb_entry, &rb->event_list);
4327                 spin_unlock_irqrestore(&rb->event_lock, flags);
4328         }
4329
4330         rcu_assign_pointer(event->rb, rb);
4331
4332         if (old_rb) {
4333                 ring_buffer_put(old_rb);
4334                 /*
4335                  * Since we detached before setting the new rb, so that we
4336                  * could attach the new rb, we could have missed a wakeup.
4337                  * Provide it now.
4338                  */
4339                 wake_up_all(&event->waitq);
4340         }
4341 }
4342
4343 static void ring_buffer_wakeup(struct perf_event *event)
4344 {
4345         struct ring_buffer *rb;
4346
4347         rcu_read_lock();
4348         rb = rcu_dereference(event->rb);
4349         if (rb) {
4350                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4351                         wake_up_all(&event->waitq);
4352         }
4353         rcu_read_unlock();
4354 }
4355
4356 static void rb_free_rcu(struct rcu_head *rcu_head)
4357 {
4358         struct ring_buffer *rb;
4359
4360         rb = container_of(rcu_head, struct ring_buffer, rcu_head);
4361         rb_free(rb);
4362 }
4363
4364 struct ring_buffer *ring_buffer_get(struct perf_event *event)
4365 {
4366         struct ring_buffer *rb;
4367
4368         rcu_read_lock();
4369         rb = rcu_dereference(event->rb);
4370         if (rb) {
4371                 if (!atomic_inc_not_zero(&rb->refcount))
4372                         rb = NULL;
4373         }
4374         rcu_read_unlock();
4375
4376         return rb;
4377 }
4378
4379 void ring_buffer_put(struct ring_buffer *rb)
4380 {
4381         if (!atomic_dec_and_test(&rb->refcount))
4382                 return;
4383
4384         WARN_ON_ONCE(!list_empty(&rb->event_list));
4385
4386         call_rcu(&rb->rcu_head, rb_free_rcu);
4387 }
4388
4389 static void perf_mmap_open(struct vm_area_struct *vma)
4390 {
4391         struct perf_event *event = vma->vm_file->private_data;
4392
4393         atomic_inc(&event->mmap_count);
4394         atomic_inc(&event->rb->mmap_count);
4395
4396         if (vma->vm_pgoff)
4397                 atomic_inc(&event->rb->aux_mmap_count);
4398
4399         if (event->pmu->event_mapped)
4400                 event->pmu->event_mapped(event);
4401 }
4402
4403 /*
4404  * A buffer can be mmap()ed multiple times; either directly through the same
4405  * event, or through other events by use of perf_event_set_output().
4406  *
4407  * In order to undo the VM accounting done by perf_mmap() we need to destroy
4408  * the buffer here, where we still have a VM context. This means we need
4409  * to detach all events redirecting to us.
4410  */
4411 static void perf_mmap_close(struct vm_area_struct *vma)
4412 {
4413         struct perf_event *event = vma->vm_file->private_data;
4414
4415         struct ring_buffer *rb = ring_buffer_get(event);
4416         struct user_struct *mmap_user = rb->mmap_user;
4417         int mmap_locked = rb->mmap_locked;
4418         unsigned long size = perf_data_size(rb);
4419
4420         if (event->pmu->event_unmapped)
4421                 event->pmu->event_unmapped(event);
4422
4423         /*
4424          * rb->aux_mmap_count will always drop before rb->mmap_count and
4425          * event->mmap_count, so it is ok to use event->mmap_mutex to
4426          * serialize with perf_mmap here.
4427          */
4428         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4429             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4430                 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4431                 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4432
4433                 rb_free_aux(rb);
4434                 mutex_unlock(&event->mmap_mutex);
4435         }
4436
4437         atomic_dec(&rb->mmap_count);
4438
4439         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
4440                 goto out_put;
4441
4442         ring_buffer_attach(event, NULL);
4443         mutex_unlock(&event->mmap_mutex);
4444
4445         /* If there's still other mmap()s of this buffer, we're done. */
4446         if (atomic_read(&rb->mmap_count))
4447                 goto out_put;
4448
4449         /*
4450          * No other mmap()s, detach from all other events that might redirect
4451          * into the now unreachable buffer. Somewhat complicated by the
4452          * fact that rb::event_lock otherwise nests inside mmap_mutex.
4453          */
4454 again:
4455         rcu_read_lock();
4456         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4457                 if (!atomic_long_inc_not_zero(&event->refcount)) {
4458                         /*
4459                          * This event is en-route to free_event() which will
4460                          * detach it and remove it from the list.
4461                          */
4462                         continue;
4463                 }
4464                 rcu_read_unlock();
4465
4466                 mutex_lock(&event->mmap_mutex);
4467                 /*
4468                  * Check we didn't race with perf_event_set_output() which can
4469                  * swizzle the rb from under us while we were waiting to
4470                  * acquire mmap_mutex.
4471                  *
4472                  * If we find a different rb; ignore this event, a next
4473                  * iteration will no longer find it on the list. We have to
4474                  * still restart the iteration to make sure we're not now
4475                  * iterating the wrong list.
4476                  */
4477                 if (event->rb == rb)
4478                         ring_buffer_attach(event, NULL);
4479
4480                 mutex_unlock(&event->mmap_mutex);
4481                 put_event(event);
4482
4483                 /*
4484                  * Restart the iteration; either we're on the wrong list or
4485                  * destroyed its integrity by doing a deletion.
4486                  */
4487                 goto again;
4488         }
4489         rcu_read_unlock();
4490
4491         /*
4492          * It could be there's still a few 0-ref events on the list; they'll
4493          * get cleaned up by free_event() -- they'll also still have their
4494          * ref on the rb and will free it whenever they are done with it.
4495          *
4496          * Aside from that, this buffer is 'fully' detached and unmapped,
4497          * undo the VM accounting.
4498          */
4499
4500         atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4501         vma->vm_mm->pinned_vm -= mmap_locked;
4502         free_uid(mmap_user);
4503
4504 out_put:
4505         ring_buffer_put(rb); /* could be last */
4506 }
4507
4508 static const struct vm_operations_struct perf_mmap_vmops = {
4509         .open           = perf_mmap_open,
4510         .close          = perf_mmap_close, /* non mergable */
4511         .fault          = perf_mmap_fault,
4512         .page_mkwrite   = perf_mmap_fault,
4513 };
4514
4515 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4516 {
4517         struct perf_event *event = file->private_data;
4518         unsigned long user_locked, user_lock_limit;
4519         struct user_struct *user = current_user();
4520         unsigned long locked, lock_limit;
4521         struct ring_buffer *rb = NULL;
4522         unsigned long vma_size;
4523         unsigned long nr_pages;
4524         long user_extra = 0, extra = 0;
4525         int ret = 0, flags = 0;
4526
4527         /*
4528          * Don't allow mmap() of inherited per-task counters. This would
4529          * create a performance issue due to all children writing to the
4530          * same rb.
4531          */
4532         if (event->cpu == -1 && event->attr.inherit)
4533                 return -EINVAL;
4534
4535         if (!(vma->vm_flags & VM_SHARED))
4536                 return -EINVAL;
4537
4538         vma_size = vma->vm_end - vma->vm_start;
4539
4540         if (vma->vm_pgoff == 0) {
4541                 nr_pages = (vma_size / PAGE_SIZE) - 1;
4542         } else {
4543                 /*
4544                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4545                  * mapped, all subsequent mappings should have the same size
4546                  * and offset. Must be above the normal perf buffer.
4547                  */
4548                 u64 aux_offset, aux_size;
4549
4550                 if (!event->rb)
4551                         return -EINVAL;
4552
4553                 nr_pages = vma_size / PAGE_SIZE;
4554
4555                 mutex_lock(&event->mmap_mutex);
4556                 ret = -EINVAL;
4557
4558                 rb = event->rb;
4559                 if (!rb)
4560                         goto aux_unlock;
4561
4562                 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4563                 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4564
4565                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4566                         goto aux_unlock;
4567
4568                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4569                         goto aux_unlock;
4570
4571                 /* already mapped with a different offset */
4572                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4573                         goto aux_unlock;
4574
4575                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4576                         goto aux_unlock;
4577
4578                 /* already mapped with a different size */
4579                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4580                         goto aux_unlock;
4581
4582                 if (!is_power_of_2(nr_pages))
4583                         goto aux_unlock;
4584
4585                 if (!atomic_inc_not_zero(&rb->mmap_count))
4586                         goto aux_unlock;
4587
4588                 if (rb_has_aux(rb)) {
4589                         atomic_inc(&rb->aux_mmap_count);
4590                         ret = 0;
4591                         goto unlock;
4592                 }
4593
4594                 atomic_set(&rb->aux_mmap_count, 1);
4595                 user_extra = nr_pages;
4596
4597                 goto accounting;
4598         }
4599
4600         /*
4601          * If we have rb pages ensure they're a power-of-two number, so we
4602          * can do bitmasks instead of modulo.
4603          */
4604         if (nr_pages != 0 && !is_power_of_2(nr_pages))
4605                 return -EINVAL;
4606
4607         if (vma_size != PAGE_SIZE * (1 + nr_pages))
4608                 return -EINVAL;
4609
4610         WARN_ON_ONCE(event->ctx->parent_ctx);
4611 again:
4612         mutex_lock(&event->mmap_mutex);
4613         if (event->rb) {
4614                 if (event->rb->nr_pages != nr_pages) {
4615                         ret = -EINVAL;
4616                         goto unlock;
4617                 }
4618
4619                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4620                         /*
4621                          * Raced against perf_mmap_close() through
4622                          * perf_event_set_output(). Try again, hope for better
4623                          * luck.
4624                          */
4625                         mutex_unlock(&event->mmap_mutex);
4626                         goto again;
4627                 }
4628
4629                 goto unlock;
4630         }
4631
4632         user_extra = nr_pages + 1;
4633
4634 accounting:
4635         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4636
4637         /*
4638          * Increase the limit linearly with more CPUs:
4639          */
4640         user_lock_limit *= num_online_cpus();
4641
4642         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4643
4644         if (user_locked > user_lock_limit)
4645                 extra = user_locked - user_lock_limit;
4646
4647         lock_limit = rlimit(RLIMIT_MEMLOCK);
4648         lock_limit >>= PAGE_SHIFT;
4649         locked = vma->vm_mm->pinned_vm + extra;
4650
4651         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4652                 !capable(CAP_IPC_LOCK)) {
4653                 ret = -EPERM;
4654                 goto unlock;
4655         }
4656
4657         WARN_ON(!rb && event->rb);
4658
4659         if (vma->vm_flags & VM_WRITE)
4660                 flags |= RING_BUFFER_WRITABLE;
4661
4662         if (!rb) {
4663                 rb = rb_alloc(nr_pages,
4664                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
4665                               event->cpu, flags);
4666
4667                 if (!rb) {
4668                         ret = -ENOMEM;
4669                         goto unlock;
4670                 }
4671
4672                 atomic_set(&rb->mmap_count, 1);
4673                 rb->mmap_user = get_current_user();
4674                 rb->mmap_locked = extra;
4675
4676                 ring_buffer_attach(event, rb);
4677
4678                 perf_event_init_userpage(event);
4679                 perf_event_update_userpage(event);
4680         } else {
4681                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4682                                    event->attr.aux_watermark, flags);
4683                 if (!ret)
4684                         rb->aux_mmap_locked = extra;
4685         }
4686
4687 unlock:
4688         if (!ret) {
4689                 atomic_long_add(user_extra, &user->locked_vm);
4690                 vma->vm_mm->pinned_vm += extra;
4691
4692                 atomic_inc(&event->mmap_count);
4693         } else if (rb) {
4694                 atomic_dec(&rb->mmap_count);
4695         }
4696 aux_unlock:
4697         mutex_unlock(&event->mmap_mutex);
4698
4699         /*
4700          * Since pinned accounting is per vm we cannot allow fork() to copy our
4701          * vma.
4702          */
4703         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4704         vma->vm_ops = &perf_mmap_vmops;
4705
4706         if (event->pmu->event_mapped)
4707                 event->pmu->event_mapped(event);
4708
4709         return ret;
4710 }
4711
4712 static int perf_fasync(int fd, struct file *filp, int on)
4713 {
4714         struct inode *inode = file_inode(filp);
4715         struct perf_event *event = filp->private_data;
4716         int retval;
4717
4718         mutex_lock(&inode->i_mutex);
4719         retval = fasync_helper(fd, filp, on, &event->fasync);
4720         mutex_unlock(&inode->i_mutex);
4721
4722         if (retval < 0)
4723                 return retval;
4724
4725         return 0;
4726 }
4727
4728 static const struct file_operations perf_fops = {
4729         .llseek                 = no_llseek,
4730         .release                = perf_release,
4731         .read                   = perf_read,
4732         .poll                   = perf_poll,
4733         .unlocked_ioctl         = perf_ioctl,
4734         .compat_ioctl           = perf_compat_ioctl,
4735         .mmap                   = perf_mmap,
4736         .fasync                 = perf_fasync,
4737 };
4738
4739 /*
4740  * Perf event wakeup
4741  *
4742  * If there's data, ensure we set the poll() state and publish everything
4743  * to user-space before waking everybody up.
4744  */
4745
4746 void perf_event_wakeup(struct perf_event *event)
4747 {
4748         ring_buffer_wakeup(event);
4749
4750         if (event->pending_kill) {
4751                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
4752                 event->pending_kill = 0;
4753         }
4754 }
4755
4756 static void perf_pending_event(struct irq_work *entry)
4757 {
4758         struct perf_event *event = container_of(entry,
4759                         struct perf_event, pending);
4760         int rctx;
4761
4762         rctx = perf_swevent_get_recursion_context();
4763         /*
4764          * If we 'fail' here, that's OK, it means recursion is already disabled
4765          * and we won't recurse 'further'.
4766          */
4767
4768         if (event->pending_disable) {
4769                 event->pending_disable = 0;
4770                 __perf_event_disable(event);
4771         }
4772
4773         if (event->pending_wakeup) {
4774                 event->pending_wakeup = 0;
4775                 perf_event_wakeup(event);
4776         }
4777
4778         if (rctx >= 0)
4779                 perf_swevent_put_recursion_context(rctx);
4780 }
4781
/*
 * We assume there is only KVM supporting the callbacks.
 * Later on, we might change it to a list if there is
 * another virtualization implementation supporting the callbacks.
 */
struct perf_guest_info_callbacks *perf_guest_cbs;

/*
 * Install @cbs as the single global set of guest info callbacks,
 * overwriting whatever pointer was stored before.  Always returns 0.
 */
int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
	perf_guest_cbs = cbs;
	return 0;
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4795
/*
 * Clear the global guest info callbacks pointer.  @cbs is not compared
 * against the currently registered pointer; the slot is reset to NULL
 * unconditionally.  Always returns 0.
 */
int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
	perf_guest_cbs = NULL;
	return 0;
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
4802
4803 static void
4804 perf_output_sample_regs(struct perf_output_handle *handle,
4805                         struct pt_regs *regs, u64 mask)
4806 {
4807         int bit;
4808
4809         for_each_set_bit(bit, (const unsigned long *) &mask,
4810                          sizeof(mask) * BITS_PER_BYTE) {
4811                 u64 val;
4812
4813                 val = perf_reg_value(regs, bit);
4814                 perf_output_put(handle, val);
4815         }
4816 }
4817
4818 static void perf_sample_regs_user(struct perf_regs *regs_user,
4819                                   struct pt_regs *regs,
4820                                   struct pt_regs *regs_user_copy)
4821 {
4822         if (user_mode(regs)) {
4823                 regs_user->abi = perf_reg_abi(current);
4824                 regs_user->regs = regs;
4825         } else if (current->mm) {
4826                 perf_get_regs_user(regs_user, regs, regs_user_copy);
4827         } else {
4828                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4829                 regs_user->regs = NULL;
4830         }
4831 }
4832
4833 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
4834                                   struct pt_regs *regs)
4835 {
4836         regs_intr->regs = regs;
4837         regs_intr->abi  = perf_reg_abi(current);
4838 }
4839
4840
4841 /*
4842  * Get remaining task size from user stack pointer.
4843  *
4844  * It'd be better to take stack vma map and limit this more
4845  * precisly, but there's no way to get it safely under interrupt,
4846  * so using TASK_SIZE as limit.
4847  */
4848 static u64 perf_ustack_task_size(struct pt_regs *regs)
4849 {
4850         unsigned long addr = perf_user_stack_pointer(regs);
4851
4852         if (!addr || addr >= TASK_SIZE)
4853                 return 0;
4854
4855         return TASK_SIZE - addr;
4856 }
4857
4858 static u16
4859 perf_sample_ustack_size(u16 stack_size, u16 header_size,
4860                         struct pt_regs *regs)
4861 {
4862         u64 task_size;
4863
4864         /* No regs, no stack pointer, no dump. */
4865         if (!regs)
4866                 return 0;
4867
4868         /*
4869          * Check if we fit in with the requested stack size into the:
4870          * - TASK_SIZE
4871          *   If we don't, we limit the size to the TASK_SIZE.
4872          *
4873          * - remaining sample size
4874          *   If we don't, we customize the stack size to
4875          *   fit in to the remaining sample size.
4876          */
4877
4878         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
4879         stack_size = min(stack_size, (u16) task_size);
4880
4881         /* Current header size plus static size and dynamic size. */
4882         header_size += 2 * sizeof(u64);
4883
4884         /* Do we fit in with the current stack dump size? */
4885         if ((u16) (header_size + stack_size) < header_size) {
4886                 /*
4887                  * If we overflow the maximum size for the sample,
4888                  * we customize the stack dump size to fit in.
4889                  */
4890                 stack_size = USHRT_MAX - header_size - sizeof(u64);
4891                 stack_size = round_up(stack_size, sizeof(u64));
4892         }
4893
4894         return stack_size;
4895 }
4896
4897 static void
4898 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
4899                           struct pt_regs *regs)
4900 {
4901         /* Case of a kernel thread, nothing to dump */
4902         if (!regs) {
4903                 u64 size = 0;
4904                 perf_output_put(handle, size);
4905         } else {
4906                 unsigned long sp;
4907                 unsigned int rem;
4908                 u64 dyn_size;
4909
4910                 /*
4911                  * We dump:
4912                  * static size
4913                  *   - the size requested by user or the best one we can fit
4914                  *     in to the sample max size
4915                  * data
4916                  *   - user stack dump data
4917                  * dynamic size
4918                  *   - the actual dumped size
4919                  */
4920
4921                 /* Static size. */
4922                 perf_output_put(handle, dump_size);
4923
4924                 /* Data. */
4925                 sp = perf_user_stack_pointer(regs);
4926                 rem = __output_copy_user(handle, (void *) sp, dump_size);
4927                 dyn_size = dump_size - rem;
4928
4929                 perf_output_skip(handle, rem);
4930
4931                 /* Dynamic size. */
4932                 perf_output_put(handle, dyn_size);
4933         }
4934 }
4935
4936 static void __perf_event_header__init_id(struct perf_event_header *header,
4937                                          struct perf_sample_data *data,
4938                                          struct perf_event *event)
4939 {
4940         u64 sample_type = event->attr.sample_type;
4941
4942         data->type = sample_type;
4943         header->size += event->id_header_size;
4944
4945         if (sample_type & PERF_SAMPLE_TID) {
4946                 /* namespace issues */
4947                 data->tid_entry.pid = perf_event_pid(event, current);
4948                 data->tid_entry.tid = perf_event_tid(event, current);
4949         }
4950
4951         if (sample_type & PERF_SAMPLE_TIME)
4952                 data->time = perf_event_clock(event);
4953
4954         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4955                 data->id = primary_event_id(event);
4956
4957         if (sample_type & PERF_SAMPLE_STREAM_ID)
4958                 data->stream_id = event->id;
4959
4960         if (sample_type & PERF_SAMPLE_CPU) {
4961                 data->cpu_entry.cpu      = raw_smp_processor_id();
4962                 data->cpu_entry.reserved = 0;
4963         }
4964 }
4965
4966 void perf_event_header__init_id(struct perf_event_header *header,
4967                                 struct perf_sample_data *data,
4968                                 struct perf_event *event)
4969 {
4970         if (event->attr.sample_id_all)
4971                 __perf_event_header__init_id(header, data, event);
4972 }
4973
4974 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4975                                            struct perf_sample_data *data)
4976 {
4977         u64 sample_type = data->type;
4978
4979         if (sample_type & PERF_SAMPLE_TID)
4980                 perf_output_put(handle, data->tid_entry);
4981
4982         if (sample_type & PERF_SAMPLE_TIME)
4983                 perf_output_put(handle, data->time);
4984
4985         if (sample_type & PERF_SAMPLE_ID)
4986                 perf_output_put(handle, data->id);
4987
4988         if (sample_type & PERF_SAMPLE_STREAM_ID)
4989                 perf_output_put(handle, data->stream_id);
4990
4991         if (sample_type & PERF_SAMPLE_CPU)
4992                 perf_output_put(handle, data->cpu_entry);
4993
4994         if (sample_type & PERF_SAMPLE_IDENTIFIER)
4995                 perf_output_put(handle, data->id);
4996 }
4997
4998 void perf_event__output_id_sample(struct perf_event *event,
4999                                   struct perf_output_handle *handle,
5000                                   struct perf_sample_data *sample)
5001 {
5002         if (event->attr.sample_id_all)
5003                 __perf_event__output_id_sample(handle, sample);
5004 }
5005
5006 static void perf_output_read_one(struct perf_output_handle *handle,
5007                                  struct perf_event *event,
5008                                  u64 enabled, u64 running)
5009 {
5010         u64 read_format = event->attr.read_format;
5011         u64 values[4];
5012         int n = 0;
5013
5014         values[n++] = perf_event_count(event);
5015         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5016                 values[n++] = enabled +
5017                         atomic64_read(&event->child_total_time_enabled);
5018         }
5019         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5020                 values[n++] = running +
5021                         atomic64_read(&event->child_total_time_running);
5022         }
5023         if (read_format & PERF_FORMAT_ID)
5024                 values[n++] = primary_event_id(event);
5025
5026         __output_copy(handle, values, n * sizeof(u64));
5027 }
5028
5029 /*
5030  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
5031  */
5032 static void perf_output_read_group(struct perf_output_handle *handle,
5033                             struct perf_event *event,
5034                             u64 enabled, u64 running)
5035 {
5036         struct perf_event *leader = event->group_leader, *sub;
5037         u64 read_format = event->attr.read_format;
5038         u64 values[5];
5039         int n = 0;
5040
5041         values[n++] = 1 + leader->nr_siblings;
5042
5043         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5044                 values[n++] = enabled;
5045
5046         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5047                 values[n++] = running;
5048
5049         if (leader != event)
5050                 leader->pmu->read(leader);
5051
5052         values[n++] = perf_event_count(leader);
5053         if (read_format & PERF_FORMAT_ID)
5054                 values[n++] = primary_event_id(leader);
5055
5056         __output_copy(handle, values, n * sizeof(u64));
5057
5058         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5059                 n = 0;
5060
5061                 if ((sub != event) &&
5062                     (sub->state == PERF_EVENT_STATE_ACTIVE))
5063                         sub->pmu->read(sub);
5064
5065                 values[n++] = perf_event_count(sub);
5066                 if (read_format & PERF_FORMAT_ID)
5067                         values[n++] = primary_event_id(sub);
5068
5069                 __output_copy(handle, values, n * sizeof(u64));
5070         }
5071 }
5072
5073 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5074                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
5075
5076 static void perf_output_read(struct perf_output_handle *handle,
5077                              struct perf_event *event)
5078 {
5079         u64 enabled = 0, running = 0, now;
5080         u64 read_format = event->attr.read_format;
5081
5082         /*
5083          * compute total_time_enabled, total_time_running
5084          * based on snapshot values taken when the event
5085          * was last scheduled in.
5086          *
5087          * we cannot simply called update_context_time()
5088          * because of locking issue as we are called in
5089          * NMI context
5090          */
5091         if (read_format & PERF_FORMAT_TOTAL_TIMES)
5092                 calc_timer_values(event, &now, &enabled, &running);
5093
5094         if (event->attr.read_format & PERF_FORMAT_GROUP)
5095                 perf_output_read_group(handle, event, enabled, running);
5096         else
5097                 perf_output_read_one(handle, event, enabled, running);
5098 }
5099
5100 void perf_output_sample(struct perf_output_handle *handle,
5101                         struct perf_event_header *header,
5102                         struct perf_sample_data *data,
5103                         struct perf_event *event)
5104 {
5105         u64 sample_type = data->type;
5106
5107         perf_output_put(handle, *header);
5108
5109         if (sample_type & PERF_SAMPLE_IDENTIFIER)
5110                 perf_output_put(handle, data->id);
5111
5112         if (sample_type & PERF_SAMPLE_IP)
5113                 perf_output_put(handle, data->ip);
5114
5115         if (sample_type & PERF_SAMPLE_TID)
5116                 perf_output_put(handle, data->tid_entry);
5117
5118         if (sample_type & PERF_SAMPLE_TIME)
5119                 perf_output_put(handle, data->time);
5120
5121         if (sample_type & PERF_SAMPLE_ADDR)
5122                 perf_output_put(handle, data->addr);
5123
5124         if (sample_type & PERF_SAMPLE_ID)
5125                 perf_output_put(handle, data->id);
5126
5127         if (sample_type & PERF_SAMPLE_STREAM_ID)
5128                 perf_output_put(handle, data->stream_id);
5129
5130         if (sample_type & PERF_SAMPLE_CPU)
5131                 perf_output_put(handle, data->cpu_entry);
5132
5133         if (sample_type & PERF_SAMPLE_PERIOD)
5134                 perf_output_put(handle, data->period);
5135
5136         if (sample_type & PERF_SAMPLE_READ)
5137                 perf_output_read(handle, event);
5138
5139         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5140                 if (data->callchain) {
5141                         int size = 1;
5142
5143                         if (data->callchain)
5144                                 size += data->callchain->nr;
5145
5146                         size *= sizeof(u64);
5147
5148                         __output_copy(handle, data->callchain, size);
5149                 } else {
5150                         u64 nr = 0;
5151                         perf_output_put(handle, nr);
5152                 }
5153         }
5154
5155         if (sample_type & PERF_SAMPLE_RAW) {
5156                 if (data->raw) {
5157                         perf_output_put(handle, data->raw->size);
5158                         __output_copy(handle, data->raw->data,
5159                                            data->raw->size);
5160                 } else {
5161                         struct {
5162                                 u32     size;
5163                                 u32     data;
5164                         } raw = {
5165                                 .size = sizeof(u32),
5166                                 .data = 0,
5167                         };
5168                         perf_output_put(handle, raw);
5169                 }
5170         }
5171
5172         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5173                 if (data->br_stack) {
5174                         size_t size;
5175
5176                         size = data->br_stack->nr
5177                              * sizeof(struct perf_branch_entry);
5178
5179                         perf_output_put(handle, data->br_stack->nr);
5180                         perf_output_copy(handle, data->br_stack->entries, size);
5181                 } else {
5182                         /*
5183                          * we always store at least the value of nr
5184                          */
5185                         u64 nr = 0;
5186                         perf_output_put(handle, nr);
5187                 }
5188         }
5189
5190         if (sample_type & PERF_SAMPLE_REGS_USER) {
5191                 u64 abi = data->regs_user.abi;
5192
5193                 /*
5194                  * If there are no regs to dump, notice it through
5195                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5196                  */
5197                 perf_output_put(handle, abi);
5198
5199                 if (abi) {
5200                         u64 mask = event->attr.sample_regs_user;
5201                         perf_output_sample_regs(handle,
5202                                                 data->regs_user.regs,
5203                                                 mask);
5204                 }
5205         }
5206
5207         if (sample_type & PERF_SAMPLE_STACK_USER) {
5208                 perf_output_sample_ustack(handle,
5209                                           data->stack_user_size,
5210                                           data->regs_user.regs);
5211         }
5212
5213         if (sample_type & PERF_SAMPLE_WEIGHT)
5214                 perf_output_put(handle, data->weight);
5215
5216         if (sample_type & PERF_SAMPLE_DATA_SRC)
5217                 perf_output_put(handle, data->data_src.val);
5218
5219         if (sample_type & PERF_SAMPLE_TRANSACTION)
5220                 perf_output_put(handle, data->txn);
5221
5222         if (sample_type & PERF_SAMPLE_REGS_INTR) {
5223                 u64 abi = data->regs_intr.abi;
5224                 /*
5225                  * If there are no regs to dump, notice it through
5226                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5227                  */
5228                 perf_output_put(handle, abi);
5229
5230                 if (abi) {
5231                         u64 mask = event->attr.sample_regs_intr;
5232
5233                         perf_output_sample_regs(handle,
5234                                                 data->regs_intr.regs,
5235                                                 mask);
5236                 }
5237         }
5238
5239         if (!event->attr.watermark) {
5240                 int wakeup_events = event->attr.wakeup_events;
5241
5242                 if (wakeup_events) {
5243                         struct ring_buffer *rb = handle->rb;
5244                         int events = local_inc_return(&rb->events);
5245
5246                         if (events >= wakeup_events) {
5247                                 local_sub(wakeup_events, &rb->events);
5248                                 local_inc(&rb->wakeup);
5249                         }
5250                 }
5251         }
5252 }
5253
5254 void perf_prepare_sample(struct perf_event_header *header,
5255                          struct perf_sample_data *data,
5256                          struct perf_event *event,
5257                          struct pt_regs *regs)
5258 {
5259         u64 sample_type = event->attr.sample_type;
5260
5261         header->type = PERF_RECORD_SAMPLE;
5262         header->size = sizeof(*header) + event->header_size;
5263
5264         header->misc = 0;
5265         header->misc |= perf_misc_flags(regs);
5266
5267         __perf_event_header__init_id(header, data, event);
5268
5269         if (sample_type & PERF_SAMPLE_IP)
5270                 data->ip = perf_instruction_pointer(regs);
5271
5272         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5273                 int size = 1;
5274
5275                 data->callchain = perf_callchain(event, regs);
5276
5277                 if (data->callchain)
5278                         size += data->callchain->nr;
5279
5280                 header->size += size * sizeof(u64);
5281         }
5282
5283         if (sample_type & PERF_SAMPLE_RAW) {
5284                 int size = sizeof(u32);
5285
5286                 if (data->raw)
5287                         size += data->raw->size;
5288                 else
5289                         size += sizeof(u32);
5290
5291                 WARN_ON_ONCE(size & (sizeof(u64)-1));
5292                 header->size += size;
5293         }
5294
5295         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5296                 int size = sizeof(u64); /* nr */
5297                 if (data->br_stack) {
5298                         size += data->br_stack->nr
5299                               * sizeof(struct perf_branch_entry);
5300                 }
5301                 header->size += size;
5302         }
5303
5304         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5305                 perf_sample_regs_user(&data->regs_user, regs,
5306                                       &data->regs_user_copy);
5307
5308         if (sample_type & PERF_SAMPLE_REGS_USER) {
5309                 /* regs dump ABI info */
5310                 int size = sizeof(u64);
5311
5312                 if (data->regs_user.regs) {
5313                         u64 mask = event->attr.sample_regs_user;
5314                         size += hweight64(mask) * sizeof(u64);
5315                 }
5316
5317                 header->size += size;
5318         }
5319
5320         if (sample_type & PERF_SAMPLE_STACK_USER) {
5321                 /*
5322                  * Either we need PERF_SAMPLE_STACK_USER bit to be allways
5323                  * processed as the last one or have additional check added
5324                  * in case new sample type is added, because we could eat
5325                  * up the rest of the sample size.
5326                  */
5327                 u16 stack_size = event->attr.sample_stack_user;
5328                 u16 size = sizeof(u64);
5329
5330                 stack_size = perf_sample_ustack_size(stack_size, header->size,
5331                                                      data->regs_user.regs);
5332
5333                 /*
5334                  * If there is something to dump, add space for the dump
5335                  * itself and for the field that tells the dynamic size,
5336                  * which is how many have been actually dumped.
5337                  */
5338                 if (stack_size)
5339                         size += sizeof(u64) + stack_size;
5340
5341                 data->stack_user_size = stack_size;
5342                 header->size += size;
5343         }
5344
5345         if (sample_type & PERF_SAMPLE_REGS_INTR) {
5346                 /* regs dump ABI info */
5347                 int size = sizeof(u64);
5348
5349                 perf_sample_regs_intr(&data->regs_intr, regs);
5350
5351                 if (data->regs_intr.regs) {
5352                         u64 mask = event->attr.sample_regs_intr;
5353
5354                         size += hweight64(mask) * sizeof(u64);
5355                 }
5356
5357                 header->size += size;
5358         }
5359 }
5360
5361 static void perf_event_output(struct perf_event *event,
5362                                 struct perf_sample_data *data,
5363                                 struct pt_regs *regs)
5364 {
5365         struct perf_output_handle handle;
5366         struct perf_event_header header;
5367
5368         /* protect the callchain buffers */
5369         rcu_read_lock();
5370
5371         perf_prepare_sample(&header, data, event, regs);
5372
5373         if (perf_output_begin(&handle, event, header.size))
5374                 goto exit;
5375
5376         perf_output_sample(&handle, &header, data, event);
5377
5378         perf_output_end(&handle);
5379
5380 exit:
5381         rcu_read_unlock();
5382 }
5383
5384 /*
5385  * read event_id
5386  */
5387
5388 struct perf_read_event {
5389         struct perf_event_header        header;
5390
5391         u32                             pid;
5392         u32                             tid;
5393 };
5394
5395 static void
5396 perf_event_read_event(struct perf_event *event,
5397                         struct task_struct *task)
5398 {
5399         struct perf_output_handle handle;
5400         struct perf_sample_data sample;
5401         struct perf_read_event read_event = {
5402                 .header = {
5403                         .type = PERF_RECORD_READ,
5404                         .misc = 0,
5405                         .size = sizeof(read_event) + event->read_size,
5406                 },
5407                 .pid = perf_event_pid(event, task),
5408                 .tid = perf_event_tid(event, task),
5409         };
5410         int ret;
5411
5412         perf_event_header__init_id(&read_event.header, &sample, event);
5413         ret = perf_output_begin(&handle, event, read_event.header.size);
5414         if (ret)
5415                 return;
5416
5417         perf_output_put(&handle, read_event);
5418         perf_output_read(&handle, event);
5419         perf_event__output_id_sample(event, &handle, &sample);
5420
5421         perf_output_end(&handle);
5422 }
5423
5424 typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5425
5426 static void
5427 perf_event_aux_ctx(struct perf_event_context *ctx,
5428                    perf_event_aux_output_cb output,
5429                    void *data)
5430 {
5431         struct perf_event *event;
5432
5433         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5434                 if (event->state < PERF_EVENT_STATE_INACTIVE)
5435                         continue;
5436                 if (!event_filter_match(event))
5437                         continue;
5438                 output(event, data);
5439         }
5440 }
5441
5442 static void
5443 perf_event_aux(perf_event_aux_output_cb output, void *data,
5444                struct perf_event_context *task_ctx)
5445 {
5446         struct perf_cpu_context *cpuctx;
5447         struct perf_event_context *ctx;
5448         struct pmu *pmu;
5449         int ctxn;
5450
5451         rcu_read_lock();
5452         list_for_each_entry_rcu(pmu, &pmus, entry) {
5453                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5454                 if (cpuctx->unique_pmu != pmu)
5455                         goto next;
5456                 perf_event_aux_ctx(&cpuctx->ctx, output, data);
5457                 if (task_ctx)
5458                         goto next;
5459                 ctxn = pmu->task_ctx_nr;
5460                 if (ctxn < 0)
5461                         goto next;
5462                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5463                 if (ctx)
5464                         perf_event_aux_ctx(ctx, output, data);
5465 next:
5466                 put_cpu_ptr(pmu->pmu_cpu_context);
5467         }
5468
5469         if (task_ctx) {
5470                 preempt_disable();
5471                 perf_event_aux_ctx(task_ctx, output, data);
5472                 preempt_enable();
5473         }
5474         rcu_read_unlock();
5475 }
5476
5477 /*
5478  * task tracking -- fork/exit
5479  *
5480  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
5481  */
5482
5483 struct perf_task_event {
5484         struct task_struct              *task;
5485         struct perf_event_context       *task_ctx;
5486
5487         struct {
5488                 struct perf_event_header        header;
5489
5490                 u32                             pid;
5491                 u32                             ppid;
5492                 u32                             tid;
5493                 u32                             ptid;
5494                 u64                             time;
5495         } event_id;
5496 };
5497
5498 static int perf_event_task_match(struct perf_event *event)
5499 {
5500         return event->attr.comm  || event->attr.mmap ||
5501                event->attr.mmap2 || event->attr.mmap_data ||
5502                event->attr.task;
5503 }
5504
5505 static void perf_event_task_output(struct perf_event *event,
5506                                    void *data)
5507 {
5508         struct perf_task_event *task_event = data;
5509         struct perf_output_handle handle;
5510         struct perf_sample_data sample;
5511         struct task_struct *task = task_event->task;
5512         int ret, size = task_event->event_id.header.size;
5513
5514         if (!perf_event_task_match(event))
5515                 return;
5516
5517         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
5518
5519         ret = perf_output_begin(&handle, event,
5520                                 task_event->event_id.header.size);
5521         if (ret)
5522                 goto out;
5523
5524         task_event->event_id.pid = perf_event_pid(event, task);
5525         task_event->event_id.ppid = perf_event_pid(event, current);
5526
5527         task_event->event_id.tid = perf_event_tid(event, task);
5528         task_event->event_id.ptid = perf_event_tid(event, current);
5529
5530         task_event->event_id.time = perf_event_clock(event);
5531
5532         perf_output_put(&handle, task_event->event_id);
5533
5534         perf_event__output_id_sample(event, &handle, &sample);
5535
5536         perf_output_end(&handle);
5537 out:
5538         task_event->event_id.header.size = size;
5539 }
5540
5541 static void perf_event_task(struct task_struct *task,
5542                               struct perf_event_context *task_ctx,
5543                               int new)
5544 {
5545         struct perf_task_event task_event;
5546
5547         if (!atomic_read(&nr_comm_events) &&
5548             !atomic_read(&nr_mmap_events) &&
5549             !atomic_read(&nr_task_events))
5550                 return;
5551
5552         task_event = (struct perf_task_event){
5553                 .task     = task,
5554                 .task_ctx = task_ctx,
5555                 .event_id    = {
5556                         .header = {
5557                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
5558                                 .misc = 0,
5559                                 .size = sizeof(task_event.event_id),
5560                         },
5561                         /* .pid  */
5562                         /* .ppid */
5563                         /* .tid  */
5564                         /* .ptid */
5565                         /* .time */
5566                 },
5567         };
5568
5569         perf_event_aux(perf_event_task_output,
5570                        &task_event,
5571                        task_ctx);
5572 }
5573
5574 void perf_event_fork(struct task_struct *task)
5575 {
5576         perf_event_task(task, NULL, 1);
5577 }
5578
5579 /*
5580  * comm tracking
5581  */
5582
5583 struct perf_comm_event {
5584         struct task_struct      *task;
5585         char                    *comm;
5586         int                     comm_size;
5587
5588         struct {
5589                 struct perf_event_header        header;
5590
5591                 u32                             pid;
5592                 u32                             tid;
5593         } event_id;
5594 };
5595
5596 static int perf_event_comm_match(struct perf_event *event)
5597 {
5598         return event->attr.comm;
5599 }
5600
5601 static void perf_event_comm_output(struct perf_event *event,
5602                                    void *data)
5603 {
5604         struct perf_comm_event *comm_event = data;
5605         struct perf_output_handle handle;
5606         struct perf_sample_data sample;
5607         int size = comm_event->event_id.header.size;
5608         int ret;
5609
5610         if (!perf_event_comm_match(event))
5611                 return;
5612
5613         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5614         ret = perf_output_begin(&handle, event,
5615                                 comm_event->event_id.header.size);
5616
5617         if (ret)
5618                 goto out;
5619
5620         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5621         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5622
5623         perf_output_put(&handle, comm_event->event_id);
5624         __output_copy(&handle, comm_event->comm,
5625                                    comm_event->comm_size);
5626
5627         perf_event__output_id_sample(event, &handle, &sample);
5628
5629         perf_output_end(&handle);
5630 out:
5631         comm_event->event_id.header.size = size;
5632 }
5633
5634 static void perf_event_comm_event(struct perf_comm_event *comm_event)
5635 {
5636         char comm[TASK_COMM_LEN];
5637         unsigned int size;
5638
5639         memset(comm, 0, sizeof(comm));
5640         strlcpy(comm, comm_event->task->comm, sizeof(comm));
5641         size = ALIGN(strlen(comm)+1, sizeof(u64));
5642
5643         comm_event->comm = comm;
5644         comm_event->comm_size = size;
5645
5646         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
5647
5648         perf_event_aux(perf_event_comm_output,
5649                        comm_event,
5650                        NULL);
5651 }
5652
5653 void perf_event_comm(struct task_struct *task, bool exec)
5654 {
5655         struct perf_comm_event comm_event;
5656
5657         if (!atomic_read(&nr_comm_events))
5658                 return;
5659
5660         comm_event = (struct perf_comm_event){
5661                 .task   = task,
5662                 /* .comm      */
5663                 /* .comm_size */
5664                 .event_id  = {
5665                         .header = {
5666                                 .type = PERF_RECORD_COMM,
5667                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
5668                                 /* .size */
5669                         },
5670                         /* .pid */
5671                         /* .tid */
5672                 },
5673         };
5674
5675         perf_event_comm_event(&comm_event);
5676 }
5677
5678 /*
5679  * mmap tracking
5680  */
5681
5682 struct perf_mmap_event {
5683         struct vm_area_struct   *vma;
5684
5685         const char              *file_name;
5686         int                     file_size;
5687         int                     maj, min;
5688         u64                     ino;
5689         u64                     ino_generation;
5690         u32                     prot, flags;
5691
5692         struct {
5693                 struct perf_event_header        header;
5694
5695                 u32                             pid;
5696                 u32                             tid;
5697                 u64                             start;
5698                 u64                             len;
5699                 u64                             pgoff;
5700         } event_id;
5701 };
5702
5703 static int perf_event_mmap_match(struct perf_event *event,
5704                                  void *data)
5705 {
5706         struct perf_mmap_event *mmap_event = data;
5707         struct vm_area_struct *vma = mmap_event->vma;
5708         int executable = vma->vm_flags & VM_EXEC;
5709
5710         return (!executable && event->attr.mmap_data) ||
5711                (executable && (event->attr.mmap || event->attr.mmap2));
5712 }
5713
5714 static void perf_event_mmap_output(struct perf_event *event,
5715                                    void *data)
5716 {
5717         struct perf_mmap_event *mmap_event = data;
5718         struct perf_output_handle handle;
5719         struct perf_sample_data sample;
5720         int size = mmap_event->event_id.header.size;
5721         int ret;
5722
5723         if (!perf_event_mmap_match(event, data))
5724                 return;
5725
5726         if (event->attr.mmap2) {
5727                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5728                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5729                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5730                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5731                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5732                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5733                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5734         }
5735
5736         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5737         ret = perf_output_begin(&handle, event,
5738                                 mmap_event->event_id.header.size);
5739         if (ret)
5740                 goto out;
5741
5742         mmap_event->event_id.pid = perf_event_pid(event, current);
5743         mmap_event->event_id.tid = perf_event_tid(event, current);
5744
5745         perf_output_put(&handle, mmap_event->event_id);
5746
5747         if (event->attr.mmap2) {
5748                 perf_output_put(&handle, mmap_event->maj);
5749                 perf_output_put(&handle, mmap_event->min);
5750                 perf_output_put(&handle, mmap_event->ino);
5751                 perf_output_put(&handle, mmap_event->ino_generation);
5752                 perf_output_put(&handle, mmap_event->prot);
5753                 perf_output_put(&handle, mmap_event->flags);
5754         }
5755
5756         __output_copy(&handle, mmap_event->file_name,
5757                                    mmap_event->file_size);
5758
5759         perf_event__output_id_sample(event, &handle, &sample);
5760
5761         perf_output_end(&handle);
5762 out:
5763         mmap_event->event_id.header.size = size;
5764 }
5765
5766 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5767 {
5768         struct vm_area_struct *vma = mmap_event->vma;
5769         struct file *file = vma->vm_file;
5770         int maj = 0, min = 0;
5771         u64 ino = 0, gen = 0;
5772         u32 prot = 0, flags = 0;
5773         unsigned int size;
5774         char tmp[16];
5775         char *buf = NULL;
5776         char *name;
5777
5778         if (file) {
5779                 struct inode *inode;
5780                 dev_t dev;
5781
5782                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5783                 if (!buf) {
5784                         name = "//enomem";
5785                         goto cpy_name;
5786                 }
5787                 /*
5788                  * d_path() works from the end of the rb backwards, so we
5789                  * need to add enough zero bytes after the string to handle
5790                  * the 64bit alignment we do later.
5791                  */
5792                 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5793                 if (IS_ERR(name)) {
5794                         name = "//toolong";
5795                         goto cpy_name;
5796                 }
5797                 inode = file_inode(vma->vm_file);
5798                 dev = inode->i_sb->s_dev;
5799                 ino = inode->i_ino;
5800                 gen = inode->i_generation;
5801                 maj = MAJOR(dev);
5802                 min = MINOR(dev);
5803
5804                 if (vma->vm_flags & VM_READ)
5805                         prot |= PROT_READ;
5806                 if (vma->vm_flags & VM_WRITE)
5807                         prot |= PROT_WRITE;
5808                 if (vma->vm_flags & VM_EXEC)
5809                         prot |= PROT_EXEC;
5810
5811                 if (vma->vm_flags & VM_MAYSHARE)
5812                         flags = MAP_SHARED;
5813                 else
5814                         flags = MAP_PRIVATE;
5815
5816                 if (vma->vm_flags & VM_DENYWRITE)
5817                         flags |= MAP_DENYWRITE;
5818                 if (vma->vm_flags & VM_MAYEXEC)
5819                         flags |= MAP_EXECUTABLE;
5820                 if (vma->vm_flags & VM_LOCKED)
5821                         flags |= MAP_LOCKED;
5822                 if (vma->vm_flags & VM_HUGETLB)
5823                         flags |= MAP_HUGETLB;
5824
5825                 goto got_name;
5826         } else {
5827                 if (vma->vm_ops && vma->vm_ops->name) {
5828                         name = (char *) vma->vm_ops->name(vma);
5829                         if (name)
5830                                 goto cpy_name;
5831                 }
5832
5833                 name = (char *)arch_vma_name(vma);
5834                 if (name)
5835                         goto cpy_name;
5836
5837                 if (vma->vm_start <= vma->vm_mm->start_brk &&
5838                                 vma->vm_end >= vma->vm_mm->brk) {
5839                         name = "[heap]";
5840                         goto cpy_name;
5841                 }
5842                 if (vma->vm_start <= vma->vm_mm->start_stack &&
5843                                 vma->vm_end >= vma->vm_mm->start_stack) {
5844                         name = "[stack]";
5845                         goto cpy_name;
5846                 }
5847
5848                 name = "//anon";
5849                 goto cpy_name;
5850         }
5851
5852 cpy_name:
5853         strlcpy(tmp, name, sizeof(tmp));
5854         name = tmp;
5855 got_name:
5856         /*
5857          * Since our buffer works in 8 byte units we need to align our string
5858          * size to a multiple of 8. However, we must guarantee the tail end is
5859          * zero'd out to avoid leaking random bits to userspace.
5860          */
5861         size = strlen(name)+1;
5862         while (!IS_ALIGNED(size, sizeof(u64)))
5863                 name[size++] = '\0';
5864
5865         mmap_event->file_name = name;
5866         mmap_event->file_size = size;
5867         mmap_event->maj = maj;
5868         mmap_event->min = min;
5869         mmap_event->ino = ino;
5870         mmap_event->ino_generation = gen;
5871         mmap_event->prot = prot;
5872         mmap_event->flags = flags;
5873
5874         if (!(vma->vm_flags & VM_EXEC))
5875                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5876
5877         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5878
5879         perf_event_aux(perf_event_mmap_output,
5880                        mmap_event,
5881                        NULL);
5882
5883         kfree(buf);
5884 }
5885
5886 void perf_event_mmap(struct vm_area_struct *vma)
5887 {
5888         struct perf_mmap_event mmap_event;
5889
5890         if (!atomic_read(&nr_mmap_events))
5891                 return;
5892
5893         mmap_event = (struct perf_mmap_event){
5894                 .vma    = vma,
5895                 /* .file_name */
5896                 /* .file_size */
5897                 .event_id  = {
5898                         .header = {
5899                                 .type = PERF_RECORD_MMAP,
5900                                 .misc = PERF_RECORD_MISC_USER,
5901                                 /* .size */
5902                         },
5903                         /* .pid */
5904                         /* .tid */
5905                         .start  = vma->vm_start,
5906                         .len    = vma->vm_end - vma->vm_start,
5907                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
5908                 },
5909                 /* .maj (attr_mmap2 only) */
5910                 /* .min (attr_mmap2 only) */
5911                 /* .ino (attr_mmap2 only) */
5912                 /* .ino_generation (attr_mmap2 only) */
5913                 /* .prot (attr_mmap2 only) */
5914                 /* .flags (attr_mmap2 only) */
5915         };
5916
5917         perf_event_mmap_event(&mmap_event);
5918 }
5919
5920 void perf_event_aux_event(struct perf_event *event, unsigned long head,
5921                           unsigned long size, u64 flags)
5922 {
5923         struct perf_output_handle handle;
5924         struct perf_sample_data sample;
5925         struct perf_aux_event {
5926                 struct perf_event_header        header;
5927                 u64                             offset;
5928                 u64                             size;
5929                 u64                             flags;
5930         } rec = {
5931                 .header = {
5932                         .type = PERF_RECORD_AUX,
5933                         .misc = 0,
5934                         .size = sizeof(rec),
5935                 },
5936                 .offset         = head,
5937                 .size           = size,
5938                 .flags          = flags,
5939         };
5940         int ret;
5941
5942         perf_event_header__init_id(&rec.header, &sample, event);
5943         ret = perf_output_begin(&handle, event, rec.header.size);
5944
5945         if (ret)
5946                 return;
5947
5948         perf_output_put(&handle, rec);
5949         perf_event__output_id_sample(event, &handle, &sample);
5950
5951         perf_output_end(&handle);
5952 }
5953
5954 /*
5955  * IRQ throttle logging
5956  */
5957
5958 static void perf_log_throttle(struct perf_event *event, int enable)
5959 {
5960         struct perf_output_handle handle;
5961         struct perf_sample_data sample;
5962         int ret;
5963
5964         struct {
5965                 struct perf_event_header        header;
5966                 u64                             time;
5967                 u64                             id;
5968                 u64                             stream_id;
5969         } throttle_event = {
5970                 .header = {
5971                         .type = PERF_RECORD_THROTTLE,
5972                         .misc = 0,
5973                         .size = sizeof(throttle_event),
5974                 },
5975                 .time           = perf_event_clock(event),
5976                 .id             = primary_event_id(event),
5977                 .stream_id      = event->id,
5978         };
5979
5980         if (enable)
5981                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
5982
5983         perf_event_header__init_id(&throttle_event.header, &sample, event);
5984
5985         ret = perf_output_begin(&handle, event,
5986                                 throttle_event.header.size);
5987         if (ret)
5988                 return;
5989
5990         perf_output_put(&handle, throttle_event);
5991         perf_event__output_id_sample(event, &handle, &sample);
5992         perf_output_end(&handle);
5993 }
5994
5995 static void perf_log_itrace_start(struct perf_event *event)
5996 {
5997         struct perf_output_handle handle;
5998         struct perf_sample_data sample;
5999         struct perf_aux_event {
6000                 struct perf_event_header        header;
6001                 u32                             pid;
6002                 u32                             tid;
6003         } rec;
6004         int ret;
6005
6006         if (event->parent)
6007                 event = event->parent;
6008
6009         if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
6010             event->hw.itrace_started)
6011                 return;
6012
6013         event->hw.itrace_started = 1;
6014
6015         rec.header.type = PERF_RECORD_ITRACE_START;
6016         rec.header.misc = 0;
6017         rec.header.size = sizeof(rec);
6018         rec.pid = perf_event_pid(event, current);
6019         rec.tid = perf_event_tid(event, current);
6020
6021         perf_event_header__init_id(&rec.header, &sample, event);
6022         ret = perf_output_begin(&handle, event, rec.header.size);
6023
6024         if (ret)
6025                 return;
6026
6027         perf_output_put(&handle, rec);
6028         perf_event__output_id_sample(event, &handle, &sample);
6029
6030         perf_output_end(&handle);
6031 }
6032
6033 /*
6034  * Generic event overflow handling, sampling.
6035  */
6036
6037 static int __perf_event_overflow(struct perf_event *event,
6038                                    int throttle, struct perf_sample_data *data,
6039                                    struct pt_regs *regs)
6040 {
6041         int events = atomic_read(&event->event_limit);
6042         struct hw_perf_event *hwc = &event->hw;
6043         u64 seq;
6044         int ret = 0;
6045
6046         /*
6047          * Non-sampling counters might still use the PMI to fold short
6048          * hardware counters, ignore those.
6049          */
6050         if (unlikely(!is_sampling_event(event)))
6051                 return 0;
6052
6053         seq = __this_cpu_read(perf_throttled_seq);
6054         if (seq != hwc->interrupts_seq) {
6055                 hwc->interrupts_seq = seq;
6056                 hwc->interrupts = 1;
6057         } else {
6058                 hwc->interrupts++;
6059                 if (unlikely(throttle
6060                              && hwc->interrupts >= max_samples_per_tick)) {
6061                         __this_cpu_inc(perf_throttled_count);
6062                         hwc->interrupts = MAX_INTERRUPTS;
6063                         perf_log_throttle(event, 0);
6064                         tick_nohz_full_kick();
6065                         ret = 1;
6066                 }
6067         }
6068
6069         if (event->attr.freq) {
6070                 u64 now = perf_clock();
6071                 s64 delta = now - hwc->freq_time_stamp;
6072
6073                 hwc->freq_time_stamp = now;
6074
6075                 if (delta > 0 && delta < 2*TICK_NSEC)
6076                         perf_adjust_period(event, delta, hwc->last_period, true);
6077         }
6078
6079         /*
6080          * XXX event_limit might not quite work as expected on inherited
6081          * events
6082          */
6083
6084         event->pending_kill = POLL_IN;
6085         if (events && atomic_dec_and_test(&event->event_limit)) {
6086                 ret = 1;
6087                 event->pending_kill = POLL_HUP;
6088                 event->pending_disable = 1;
6089                 irq_work_queue(&event->pending);
6090         }
6091
6092         if (event->overflow_handler)
6093                 event->overflow_handler(event, data, regs);
6094         else
6095                 perf_event_output(event, data, regs);
6096
6097         if (event->fasync && event->pending_kill) {
6098                 event->pending_wakeup = 1;
6099                 irq_work_queue(&event->pending);
6100         }
6101
6102         return ret;
6103 }
6104
/*
 * Overflow entry point with throttling enabled; returns the result of
 * __perf_event_overflow().
 */
int perf_event_overflow(struct perf_event *event,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	const int throttle = 1;

	return __perf_event_overflow(event, throttle, data, regs);
}
6111
6112 /*
6113  * Generic software event infrastructure
6114  */
6115
6116 struct swevent_htable {
6117         struct swevent_hlist            *swevent_hlist;
6118         struct mutex                    hlist_mutex;
6119         int                             hlist_refcount;
6120
6121         /* Recursion avoidance in each contexts */
6122         int                             recursion[PERF_NR_CONTEXTS];
6123
6124         /* Keeps track of cpu being initialized/exited */
6125         bool                            online;
6126 };
6127
6128 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
6129
6130 /*
6131  * We directly increment event->count and keep a second value in
6132  * event->hw.period_left to count intervals. This period event
6133  * is kept in the range [-sample_period, 0] so that we can use the
6134  * sign as trigger.
6135  */
6136
6137 u64 perf_swevent_set_period(struct perf_event *event)
6138 {
6139         struct hw_perf_event *hwc = &event->hw;
6140         u64 period = hwc->last_period;
6141         u64 nr, offset;
6142         s64 old, val;
6143
6144         hwc->last_period = hwc->sample_period;
6145
6146 again:
6147         old = val = local64_read(&hwc->period_left);
6148         if (val < 0)
6149                 return 0;
6150
6151         nr = div64_u64(period + val, period);
6152         offset = nr * period;
6153         val -= offset;
6154         if (local64_cmpxchg(&hwc->period_left, old, val) != old)
6155                 goto again;
6156
6157         return nr;
6158 }
6159
6160 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
6161                                     struct perf_sample_data *data,
6162                                     struct pt_regs *regs)
6163 {
6164         struct hw_perf_event *hwc = &event->hw;
6165         int throttle = 0;
6166
6167         if (!overflow)
6168                 overflow = perf_swevent_set_period(event);
6169
6170         if (hwc->interrupts == MAX_INTERRUPTS)
6171                 return;
6172
6173         for (; overflow; overflow--) {
6174                 if (__perf_event_overflow(event, throttle,
6175                                             data, regs)) {
6176                         /*
6177                          * We inhibit the overflow from happening when
6178                          * hwc->interrupts == MAX_INTERRUPTS.
6179                          */
6180                         break;
6181                 }
6182                 throttle = 1;
6183         }
6184 }
6185
6186 static void perf_swevent_event(struct perf_event *event, u64 nr,
6187                                struct perf_sample_data *data,
6188                                struct pt_regs *regs)
6189 {
6190         struct hw_perf_event *hwc = &event->hw;
6191
6192         local64_add(nr, &event->count);
6193
6194         if (!regs)
6195                 return;
6196
6197         if (!is_sampling_event(event))
6198                 return;
6199
6200         if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
6201                 data->period = nr;
6202                 return perf_swevent_overflow(event, 1, data, regs);
6203         } else
6204                 data->period = event->hw.last_period;
6205
6206         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
6207                 return perf_swevent_overflow(event, 1, data, regs);
6208
6209         if (local64_add_negative(nr, &hwc->period_left))
6210                 return;
6211
6212         perf_swevent_overflow(event, 0, data, regs);
6213 }
6214
6215 static int perf_exclude_event(struct perf_event *event,
6216                               struct pt_regs *regs)
6217 {
6218         if (event->hw.state & PERF_HES_STOPPED)
6219                 return 1;
6220
6221         if (regs) {
6222                 if (event->attr.exclude_user && user_mode(regs))
6223                         return 1;
6224
6225                 if (event->attr.exclude_kernel && !user_mode(regs))
6226                         return 1;
6227         }
6228
6229         return 0;
6230 }
6231
6232 static int perf_swevent_match(struct perf_event *event,
6233                                 enum perf_type_id type,
6234                                 u32 event_id,
6235                                 struct perf_sample_data *data,
6236                                 struct pt_regs *regs)
6237 {
6238         if (event->attr.type != type)
6239                 return 0;
6240
6241         if (event->attr.config != event_id)
6242                 return 0;
6243
6244         if (perf_exclude_event(event, regs))
6245                 return 0;
6246
6247         return 1;
6248 }
6249
6250 static inline u64 swevent_hash(u64 type, u32 event_id)
6251 {
6252         u64 val = event_id | (type << 32);
6253
6254         return hash_64(val, SWEVENT_HLIST_BITS);
6255 }
6256
6257 static inline struct hlist_head *
6258 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
6259 {
6260         u64 hash = swevent_hash(type, event_id);
6261
6262         return &hlist->heads[hash];
6263 }
6264
6265 /* For the read side: events when they trigger */
6266 static inline struct hlist_head *
6267 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
6268 {
6269         struct swevent_hlist *hlist;
6270
6271         hlist = rcu_dereference(swhash->swevent_hlist);
6272         if (!hlist)
6273                 return NULL;
6274
6275         return __find_swevent_head(hlist, type, event_id);
6276 }
6277
6278 /* For the event head insertion and removal in the hlist */
6279 static inline struct hlist_head *
6280 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
6281 {
6282         struct swevent_hlist *hlist;
6283         u32 event_id = event->attr.config;
6284         u64 type = event->attr.type;
6285
6286         /*
6287          * Event scheduling is always serialized against hlist allocation
6288          * and release. Which makes the protected version suitable here.
6289          * The context lock guarantees that.
6290          */
6291         hlist = rcu_dereference_protected(swhash->swevent_hlist,
6292                                           lockdep_is_held(&event->ctx->lock));
6293         if (!hlist)
6294                 return NULL;
6295
6296         return __find_swevent_head(hlist, type, event_id);
6297 }
6298
6299 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
6300                                     u64 nr,
6301                                     struct perf_sample_data *data,
6302                                     struct pt_regs *regs)
6303 {
6304         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6305         struct perf_event *event;
6306         struct hlist_head *head;
6307
6308         rcu_read_lock();
6309         head = find_swevent_head_rcu(swhash, type, event_id);
6310         if (!head)
6311                 goto end;
6312
6313         hlist_for_each_entry_rcu(event, head, hlist_entry) {
6314                 if (perf_swevent_match(event, type, event_id, data, regs))
6315                         perf_swevent_event(event, nr, data, regs);
6316         }
6317 end:
6318         rcu_read_unlock();
6319 }
6320
/*
 * Per-CPU array of four struct pt_regs.
 * NOTE(review): no user is visible in this part of the file; presumably
 * these are scratch register snapshots, one per recursion level --
 * confirm against the users of __perf_regs.
 */
6321 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6322
6323 int perf_swevent_get_recursion_context(void)
6324 {
6325         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6326
6327         return get_recursion_context(swhash->recursion);
6328 }
6329 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
6330
6331 inline void perf_swevent_put_recursion_context(int rctx)
6332 {
6333         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6334
6335         put_recursion_context(swhash->recursion, rctx);
6336 }
6337
6338 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6339 {
6340         struct perf_sample_data data;
6341
6342         if (WARN_ON_ONCE(!regs))
6343                 return;
6344
6345         perf_sample_data_init(&data, addr, 0);
6346         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6347 }
6348
6349 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6350 {
6351         int rctx;
6352
6353         preempt_disable_notrace();
6354         rctx = perf_swevent_get_recursion_context();
6355         if (unlikely(rctx < 0))
6356                 goto fail;
6357
6358         ___perf_sw_event(event_id, nr, regs, addr);
6359
6360         perf_swevent_put_recursion_context(rctx);
6361 fail:
6362         preempt_enable_notrace();
6363 }
6364
/* Intentionally empty ->read() callback: nothing is done for @event here. */
static void perf_swevent_read(struct perf_event *event)
{
}
6368
6369 static int perf_swevent_add(struct perf_event *event, int flags)
6370 {
6371         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6372         struct hw_perf_event *hwc = &event->hw;
6373         struct hlist_head *head;
6374
6375         if (is_sampling_event(event)) {
6376                 hwc->last_period = hwc->sample_period;
6377                 perf_swevent_set_period(event);
6378         }
6379
6380         hwc->state = !(flags & PERF_EF_START);
6381
6382         head = find_swevent_head(swhash, event);
6383         if (!head) {
6384                 /*
6385                  * We can race with cpu hotplug code. Do not
6386                  * WARN if the cpu just got unplugged.
6387                  */
6388                 WARN_ON_ONCE(swhash->online);
6389                 return -EINVAL;
6390         }
6391
6392         hlist_add_head_rcu(&event->hlist_entry, head);
6393         perf_event_update_userpage(event);
6394
6395         return 0;
6396 }
6397
6398 static void perf_swevent_del(struct perf_event *event, int flags)
6399 {
6400         hlist_del_rcu(&event->hlist_entry);
6401 }
6402
6403 static void perf_swevent_start(struct perf_event *event, int flags)
6404 {
6405         event->hw.state = 0;
6406 }
6407
6408 static void perf_swevent_stop(struct perf_event *event, int flags)
6409 {
6410         event->hw.state = PERF_HES_STOPPED;
6411 }
6412
6413 /* Deref the hlist from the update side */
6414 static inline struct swevent_hlist *
6415 swevent_hlist_deref(struct swevent_htable *swhash)
6416 {
6417         return rcu_dereference_protected(swhash->swevent_hlist,
6418                                          lockdep_is_held(&swhash->hlist_mutex));
6419 }
6420
6421 static void swevent_hlist_release(struct swevent_htable *swhash)
6422 {
6423         struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
6424
6425         if (!hlist)
6426                 return;
6427
6428         RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
6429         kfree_rcu(hlist, rcu_head);
6430 }
6431
6432 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
6433 {
6434         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6435
6436         mutex_lock(&swhash->hlist_mutex);
6437
6438         if (!--swhash->hlist_refcount)
6439                 swevent_hlist_release(swhash);
6440
6441         mutex_unlock(&swhash->hlist_mutex);
6442 }
6443
6444 static void swevent_hlist_put(struct perf_event *event)
6445 {
6446         int cpu;
6447
6448         for_each_possible_cpu(cpu)
6449                 swevent_hlist_put_cpu(event, cpu);
6450 }
6451
6452 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
6453 {
6454         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6455         int err = 0;
6456
6457         mutex_lock(&swhash->hlist_mutex);
6458
6459         if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
6460                 struct swevent_hlist *hlist;
6461
6462                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
6463                 if (!hlist) {
6464                         err = -ENOMEM;
6465                         goto exit;
6466                 }
6467                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
6468         }
6469         swhash->hlist_refcount++;
6470 exit:
6471         mutex_unlock(&swhash->hlist_mutex);
6472
6473         return err;
6474 }
6475
6476 static int swevent_hlist_get(struct perf_event *event)
6477 {
6478         int err;
6479         int cpu, failed_cpu;
6480
6481         get_online_cpus();
6482         for_each_possible_cpu(cpu) {
6483                 err = swevent_hlist_get_cpu(event, cpu);
6484                 if (err) {
6485                         failed_cpu = cpu;
6486                         goto fail;
6487                 }
6488         }
6489         put_online_cpus();
6490
6491         return 0;
6492 fail:
6493         for_each_possible_cpu(cpu) {
6494                 if (cpu == failed_cpu)
6495                         break;
6496                 swevent_hlist_put_cpu(event, cpu);
6497         }
6498
6499         put_online_cpus();
6500         return err;
6501 }
6502
/*
 * One static key per software event id, indexed by attr.config.
 * perf_swevent_init() increments the key of an event's id and
 * sw_perf_event_destroy() decrements it again.
 */
6503 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
6504
6505 static void sw_perf_event_destroy(struct perf_event *event)
6506 {
6507         u64 event_id = event->attr.config;
6508
6509         WARN_ON(event->parent);
6510
6511         static_key_slow_dec(&perf_swevent_enabled[event_id]);
6512         swevent_hlist_put(event);
6513 }
6514
6515 static int perf_swevent_init(struct perf_event *event)
6516 {
6517         u64 event_id = event->attr.config;
6518
6519         if (event->attr.type != PERF_TYPE_SOFTWARE)
6520                 return -ENOENT;
6521
6522         /*
6523          * no branch sampling for software events
6524          */
6525         if (has_branch_stack(event))
6526                 return -EOPNOTSUPP;
6527
6528         switch (event_id) {
6529         case PERF_COUNT_SW_CPU_CLOCK:
6530         case PERF_COUNT_SW_TASK_CLOCK:
6531                 return -ENOENT;
6532
6533         default:
6534                 break;
6535         }
6536
6537         if (event_id >= PERF_COUNT_SW_MAX)
6538                 return -ENOENT;
6539
6540         if (!event->parent) {
6541                 int err;
6542
6543                 err = swevent_hlist_get(event);
6544                 if (err)
6545                         return err;
6546
6547                 static_key_slow_inc(&perf_swevent_enabled[event_id]);
6548                 event->destroy = sw_perf_event_destroy;
6549         }
6550
6551         return 0;
6552 }
6553
6554 static struct pmu perf_swevent = {
6555         .task_ctx_nr    = perf_sw_context,
6556
6557         .capabilities   = PERF_PMU_CAP_NO_NMI,
6558
6559         .event_init     = perf_swevent_init,
6560         .add            = perf_swevent_add,
6561         .del            = perf_swevent_del,
6562         .start          = perf_swevent_start,
6563         .stop           = perf_swevent_stop,
6564         .read           = perf_swevent_read,
6565 };
6566
6567 #ifdef CONFIG_EVENT_TRACING
6568
6569 static int perf_tp_filter_match(struct perf_event *event,
6570                                 struct perf_sample_data *data)
6571 {
6572         void *record = data->raw->data;
6573
6574         if (likely(!event->filter) || filter_match_preds(event->filter, record))
6575                 return 1;
6576         return 0;
6577 }
6578
6579 static int perf_tp_event_match(struct perf_event *event,
6580                                 struct perf_sample_data *data,
6581                                 struct pt_regs *regs)
6582 {
6583         if (event->hw.state & PERF_HES_STOPPED)
6584                 return 0;
6585         /*
6586          * All tracepoints are from kernel-space.
6587          */
6588         if (event->attr.exclude_kernel)
6589                 return 0;
6590
6591         if (!perf_tp_filter_match(event, data))
6592                 return 0;
6593
6594         return 1;
6595 }
6596
6597 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
6598                    struct pt_regs *regs, struct hlist_head *head, int rctx,
6599                    struct task_struct *task)
6600 {
6601         struct perf_sample_data data;
6602         struct perf_event *event;
6603
6604         struct perf_raw_record raw = {
6605                 .size = entry_size,
6606                 .data = record,
6607         };
6608
6609         perf_sample_data_init(&data, addr, 0);
6610         data.raw = &raw;
6611
6612         hlist_for_each_entry_rcu(event, head, hlist_entry) {
6613                 if (perf_tp_event_match(event, &data, regs))
6614                         perf_swevent_event(event, count, &data, regs);
6615         }
6616
6617         /*
6618          * If we got specified a target task, also iterate its context and
6619          * deliver this event there too.
6620          */
6621         if (task && task != current) {
6622                 struct perf_event_context *ctx;
6623                 struct trace_entry *entry = record;
6624
6625                 rcu_read_lock();
6626                 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
6627                 if (!ctx)
6628                         goto unlock;
6629
6630                 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6631                         if (event->attr.type != PERF_TYPE_TRACEPOINT)
6632                                 continue;
6633                         if (event->attr.config != entry->type)
6634                                 continue;
6635                         if (perf_tp_event_match(event, &data, regs))
6636                                 perf_swevent_event(event, count, &data, regs);
6637                 }
6638 unlock:
6639                 rcu_read_unlock();
6640         }
6641
6642         perf_swevent_put_recursion_context(rctx);
6643 }
6644 EXPORT_SYMBOL_GPL(perf_tp_event);
6645
/* ->destroy() hook for tracepoint events; forwards to perf_trace_destroy(). */
static void tp_perf_event_destroy(struct perf_event *event)
{
	perf_trace_destroy(event);
}
6650
6651 static int perf_tp_event_init(struct perf_event *event)
6652 {
6653         int err;
6654
6655         if (event->attr.type != PERF_TYPE_TRACEPOINT)
6656                 return -ENOENT;
6657
6658         /*
6659          * no branch sampling for tracepoint events
6660          */
6661         if (has_branch_stack(event))
6662                 return -EOPNOTSUPP;
6663
6664         err = perf_trace_init(event);
6665         if (err)
6666                 return err;
6667
6668         event->destroy = tp_perf_event_destroy;
6669
6670         return 0;
6671 }
6672
6673 static struct pmu perf_tracepoint = {
6674         .task_ctx_nr    = perf_sw_context,
6675
6676         .event_init     = perf_tp_event_init,
6677         .add            = perf_trace_add,
6678         .del            = perf_trace_del,
6679         .start          = perf_swevent_start,
6680         .stop           = perf_swevent_stop,
6681         .read           = perf_swevent_read,
6682 };
6683
6684 static inline void perf_tp_register(void)
6685 {
6686         perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
6687 }
6688
6689 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6690 {
6691         char *filter_str;
6692         int ret;
6693
6694         if (event->attr.type != PERF_TYPE_TRACEPOINT)
6695                 return -EINVAL;
6696
6697         filter_str = strndup_user(arg, PAGE_SIZE);
6698         if (IS_ERR(filter_str))
6699                 return PTR_ERR(filter_str);
6700
6701         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
6702
6703         kfree(filter_str);
6704         return ret;
6705 }
6706
/* Release @event's filter by forwarding to ftrace_profile_free_filter(). */
static void perf_event_free_filter(struct perf_event *event)
{
	ftrace_profile_free_filter(event);
}
6711
6712 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6713 {
6714         struct bpf_prog *prog;
6715
6716         if (event->attr.type != PERF_TYPE_TRACEPOINT)
6717                 return -EINVAL;
6718
6719         if (event->tp_event->prog)
6720                 return -EEXIST;
6721
6722         if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
6723                 /* bpf programs can only be attached to kprobes */
6724                 return -EINVAL;
6725
6726         prog = bpf_prog_get(prog_fd);
6727         if (IS_ERR(prog))
6728                 return PTR_ERR(prog);
6729
6730         if (prog->type != BPF_PROG_TYPE_KPROBE) {
6731                 /* valid fd, but invalid bpf program type */
6732                 bpf_prog_put(prog);
6733                 return -EINVAL;
6734         }
6735
6736         event->tp_event->prog = prog;
6737
6738         return 0;
6739 }
6740
6741 static void perf_event_free_bpf_prog(struct perf_event *event)
6742 {
6743         struct bpf_prog *prog;
6744
6745         if (!event->tp_event)
6746                 return;
6747
6748         prog = event->tp_event->prog;
6749         if (prog) {
6750                 event->tp_event->prog = NULL;
6751                 bpf_prog_put(prog);
6752         }
6753 }
6754
6755 #else
6756
/* Stub for builds without CONFIG_EVENT_TRACING: nothing to register. */
static inline void perf_tp_register(void)
{
}
6760
6761 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6762 {
6763         return -ENOENT;
6764 }
6765
/* Stub for builds without CONFIG_EVENT_TRACING: there is no filter to free. */
static void perf_event_free_filter(struct perf_event *event)
{
}
6769
6770 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6771 {
6772         return -ENOENT;
6773 }
6774
/* Stub for builds without CONFIG_EVENT_TRACING: no program to release. */
static void perf_event_free_bpf_prog(struct perf_event *event)
{
}
6778 #endif /* CONFIG_EVENT_TRACING */
6779
6780 #ifdef CONFIG_HAVE_HW_BREAKPOINT
6781 void perf_bp_event(struct perf_event *bp, void *data)
6782 {
6783         struct perf_sample_data sample;
6784         struct pt_regs *regs = data;
6785
6786         perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
6787
6788         if (!bp->hw.state && !perf_exclude_event(bp, regs))
6789                 perf_swevent_event(bp, 1, &sample, regs);
6790 }
6791 #endif
6792
6793 /*
6794  * hrtimer based swevent callback
6795  */
6796
6797 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
6798 {
6799         enum hrtimer_restart ret = HRTIMER_RESTART;
6800         struct perf_sample_data data;
6801         struct pt_regs *regs;
6802         struct perf_event *event;
6803         u64 period;
6804
6805         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
6806
6807         if (event->state != PERF_EVENT_STATE_ACTIVE)
6808                 return HRTIMER_NORESTART;
6809
6810         event->pmu->read(event);
6811
6812         perf_sample_data_init(&data, 0, event->hw.last_period);
6813         regs = get_irq_regs();
6814
6815         if (regs && !perf_exclude_event(event, regs)) {
6816                 if (!(event->attr.exclude_idle && is_idle_task(current)))
6817                         if (__perf_event_overflow(event, 1, &data, regs))
6818                                 ret = HRTIMER_NORESTART;
6819         }
6820
6821         period = max_t(u64, 10000, event->hw.sample_period);
6822         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
6823
6824         return ret;
6825 }
6826
6827 static void perf_swevent_start_hrtimer(struct perf_event *event)
6828 {
6829         struct hw_perf_event *hwc = &event->hw;
6830         s64 period;
6831
6832         if (!is_sampling_event(event))
6833                 return;
6834
6835         period = local64_read(&hwc->period_left);
6836         if (period) {
6837                 if (period < 0)
6838                         period = 10000;
6839
6840                 local64_set(&hwc->period_left, 0);
6841         } else {
6842                 period = max_t(u64, 10000, hwc->sample_period);
6843         }
6844         hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
6845                       HRTIMER_MODE_REL_PINNED);
6846 }
6847
6848 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
6849 {
6850         struct hw_perf_event *hwc = &event->hw;
6851
6852         if (is_sampling_event(event)) {
6853                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
6854                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
6855
6856                 hrtimer_cancel(&hwc->hrtimer);
6857         }
6858 }
6859
6860 static void perf_swevent_init_hrtimer(struct perf_event *event)
6861 {
6862         struct hw_perf_event *hwc = &event->hw;
6863
6864         if (!is_sampling_event(event))
6865                 return;
6866
6867         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6868         hwc->hrtimer.function = perf_swevent_hrtimer;
6869
6870         /*
6871          * Since hrtimers have a fixed rate, we can do a static freq->period
6872          * mapping and avoid the whole period adjust feedback stuff.
6873          */
6874         if (event->attr.freq) {
6875                 long freq = event->attr.sample_freq;
6876
6877                 event->attr.sample_period = NSEC_PER_SEC / freq;
6878                 hwc->sample_period = event->attr.sample_period;
6879                 local64_set(&hwc->period_left, hwc->sample_period);
6880                 hwc->last_period = hwc->sample_period;
6881                 event->attr.freq = 0;
6882         }
6883 }
6884
6885 /*
6886  * Software event: cpu wall time clock
6887  */
6888
6889 static void cpu_clock_event_update(struct perf_event *event)
6890 {
6891         s64 prev;
6892         u64 now;
6893
6894         now = local_clock();
6895         prev = local64_xchg(&event->hw.prev_count, now);
6896         local64_add(now - prev, &event->count);
6897 }
6898
6899 static void cpu_clock_event_start(struct perf_event *event, int flags)
6900 {
6901         local64_set(&event->hw.prev_count, local_clock());
6902         perf_swevent_start_hrtimer(event);
6903 }
6904
/* ->stop(): cancel the sampling timer, then fold the elapsed time in. */
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
	perf_swevent_cancel_hrtimer(event);
	cpu_clock_event_update(event);
}
6910
6911 static int cpu_clock_event_add(struct perf_event *event, int flags)
6912 {
6913         if (flags & PERF_EF_START)
6914                 cpu_clock_event_start(event, flags);
6915         perf_event_update_userpage(event);
6916
6917         return 0;
6918 }
6919
/* ->del(): removing the event is the same as stopping it. */
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
	cpu_clock_event_stop(event, flags);
}
6924
/* ->read(): bring the event count up to the current local_clock(). */
static void cpu_clock_event_read(struct perf_event *event)
{
	cpu_clock_event_update(event);
}
6929
6930 static int cpu_clock_event_init(struct perf_event *event)
6931 {
6932         if (event->attr.type != PERF_TYPE_SOFTWARE)
6933                 return -ENOENT;
6934
6935         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
6936                 return -ENOENT;
6937
6938         /*
6939          * no branch sampling for software events
6940          */
6941         if (has_branch_stack(event))
6942                 return -EOPNOTSUPP;
6943
6944         perf_swevent_init_hrtimer(event);
6945
6946         return 0;
6947 }
6948
6949 static struct pmu perf_cpu_clock = {
6950         .task_ctx_nr    = perf_sw_context,
6951
6952         .capabilities   = PERF_PMU_CAP_NO_NMI,
6953
6954         .event_init     = cpu_clock_event_init,
6955         .add            = cpu_clock_event_add,
6956         .del            = cpu_clock_event_del,
6957         .start          = cpu_clock_event_start,
6958         .stop           = cpu_clock_event_stop,
6959         .read           = cpu_clock_event_read,
6960 };
6961
6962 /*
6963  * Software event: task time clock
6964  */
6965
6966 static void task_clock_event_update(struct perf_event *event, u64 now)
6967 {
6968         u64 prev;
6969         s64 delta;
6970
6971         prev = local64_xchg(&event->hw.prev_count, now);
6972         delta = now - prev;
6973         local64_add(delta, &event->count);
6974 }
6975
6976 static void task_clock_event_start(struct perf_event *event, int flags)
6977 {
6978         local64_set(&event->hw.prev_count, event->ctx->time);
6979         perf_swevent_start_hrtimer(event);
6980 }
6981
6982 static void task_clock_event_stop(struct perf_event *event, int flags)
6983 {
6984         perf_swevent_cancel_hrtimer(event);
6985         task_clock_event_update(event, event->ctx->time);
6986 }
6987
6988 static int task_clock_event_add(struct perf_event *event, int flags)
6989 {
6990         if (flags & PERF_EF_START)
6991                 task_clock_event_start(event, flags);
6992         perf_event_update_userpage(event);
6993
6994         return 0;
6995 }
6996
6997 static void task_clock_event_del(struct perf_event *event, int flags)
6998 {
6999         task_clock_event_stop(event, PERF_EF_UPDATE);
7000 }
7001
7002 static void task_clock_event_read(struct perf_event *event)
7003 {
7004         u64 now = perf_clock();
7005         u64 delta = now - event->ctx->timestamp;
7006         u64 time = event->ctx->time + delta;
7007
7008         task_clock_event_update(event, time);
7009 }
7010
7011 static int task_clock_event_init(struct perf_event *event)
7012 {
7013         if (event->attr.type != PERF_TYPE_SOFTWARE)
7014                 return -ENOENT;
7015
7016         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
7017                 return -ENOENT;
7018
7019         /*
7020          * no branch sampling for software events
7021          */
7022         if (has_branch_stack(event))
7023                 return -EOPNOTSUPP;
7024
7025         perf_swevent_init_hrtimer(event);
7026
7027         return 0;
7028 }
7029
7030 static struct pmu perf_task_clock = {
7031         .task_ctx_nr    = perf_sw_context,
7032
7033         .capabilities   = PERF_PMU_CAP_NO_NMI,
7034
7035         .event_init     = task_clock_event_init,
7036         .add            = task_clock_event_add,
7037         .del            = task_clock_event_del,
7038         .start          = task_clock_event_start,
7039         .stop           = task_clock_event_stop,
7040         .read           = task_clock_event_read,
7041 };
7042
/* Do-nothing callback for struct pmu hooks that return void. */
static void perf_pmu_nop_void(struct pmu *pmu)
{
}
7046
/* Do-nothing callback for struct pmu hooks that return int: always 0. */
static int perf_pmu_nop_int(struct pmu *pmu)
{
	return 0;
}
7051
/* Default ->start_txn(): disable the PMU for the duration of the transaction. */
static void perf_pmu_start_txn(struct pmu *pmu)
{
	perf_pmu_disable(pmu);
}
7056
/* Default ->commit_txn(): re-enable the PMU and report success (0). */
static int perf_pmu_commit_txn(struct pmu *pmu)
{
	perf_pmu_enable(pmu);

	return 0;
}
7062
/* Default ->cancel_txn(): re-enable the PMU. */
static void perf_pmu_cancel_txn(struct pmu *pmu)
{
	perf_pmu_enable(pmu);
}
7067
/* Default ->event_idx(): every event reports index 0. */
static int perf_event_idx_default(struct perf_event *event)
{
	return 0;
}
7072
7073 /*
7074  * Ensures all contexts with the same task_ctx_nr have the same
7075  * pmu_cpu_context too.
7076  */
7077 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
7078 {
7079         struct pmu *pmu;
7080
7081         if (ctxn < 0)
7082                 return NULL;
7083
7084         list_for_each_entry(pmu, &pmus, entry) {
7085                 if (pmu->task_ctx_nr == ctxn)
7086                         return pmu->pmu_cpu_context;
7087         }
7088
7089         return NULL;
7090 }
7091
7092 static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
7093 {
7094         int cpu;
7095
7096         for_each_possible_cpu(cpu) {
7097                 struct perf_cpu_context *cpuctx;
7098
7099                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7100
7101                 if (cpuctx->unique_pmu == old_pmu)
7102                         cpuctx->unique_pmu = pmu;
7103         }
7104 }
7105
7106 static void free_pmu_context(struct pmu *pmu)
7107 {
7108         struct pmu *i;
7109
7110         mutex_lock(&pmus_lock);
7111         /*
7112          * Like a real lame refcount.
7113          */
7114         list_for_each_entry(i, &pmus, entry) {
7115                 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
7116                         update_pmu_context(i, pmu);
7117                         goto out;
7118                 }
7119         }
7120
7121         free_percpu(pmu->pmu_cpu_context);
7122 out:
7123         mutex_unlock(&pmus_lock);
7124 }
/*
 * IDR from which perf_pmu_register() allocates a dynamic type number
 * (starting at PERF_TYPE_MAX) when it is called with a negative type.
 */
7125 static struct idr pmu_idr;
7126
7127 static ssize_t
7128 type_show(struct device *dev, struct device_attribute *attr, char *page)
7129 {
7130         struct pmu *pmu = dev_get_drvdata(dev);
7131
7132         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
7133 }
7134 static DEVICE_ATTR_RO(type);
7135
7136 static ssize_t
7137 perf_event_mux_interval_ms_show(struct device *dev,
7138                                 struct device_attribute *attr,
7139                                 char *page)
7140 {
7141         struct pmu *pmu = dev_get_drvdata(dev);
7142
7143         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
7144 }
7145
7146 static ssize_t
7147 perf_event_mux_interval_ms_store(struct device *dev,
7148                                  struct device_attribute *attr,
7149                                  const char *buf, size_t count)
7150 {
7151         struct pmu *pmu = dev_get_drvdata(dev);
7152         int timer, cpu, ret;
7153
7154         ret = kstrtoint(buf, 0, &timer);
7155         if (ret)
7156                 return ret;
7157
7158         if (timer < 1)
7159                 return -EINVAL;
7160
7161         /* same value, noting to do */
7162         if (timer == pmu->hrtimer_interval_ms)
7163                 return count;
7164
7165         pmu->hrtimer_interval_ms = timer;
7166
7167         /* update all cpuctx for this PMU */
7168         for_each_possible_cpu(cpu) {
7169                 struct perf_cpu_context *cpuctx;
7170                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7171                 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
7172
7173                 if (hrtimer_active(&cpuctx->hrtimer))
7174                         hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
7175         }
7176
7177         return count;
7178 }
7179 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
7180
/*
 * NULL-terminated list of the sysfs attributes defined above: "type"
 * and "perf_event_mux_interval_ms".  ATTRIBUTE_GROUPS() wraps the list
 * into pmu_dev_groups, which pmu_bus installs as its dev_groups.
 */
7181 static struct attribute *pmu_dev_attrs[] = {
7182         &dev_attr_type.attr,
7183         &dev_attr_perf_event_mux_interval_ms.attr,
7184         NULL,
7185 };
7186 ATTRIBUTE_GROUPS(pmu_dev);
7187
/*
 * Flag tested by perf_pmu_register(): the sysfs device of a pmu is only
 * allocated at registration time when this is non-zero.
 * NOTE(review): where the flag is set is not visible in this part of
 * the file.
 */
7188 static int pmu_bus_running;
/* The "event_source" bus that pmu devices are attached to. */
7189 static struct bus_type pmu_bus = {
7190         .name           = "event_source",
7191         .dev_groups     = pmu_dev_groups,
7192 };
7193
/*
 * Device release callback installed by pmu_dev_alloc(): frees the
 * struct device that pmu_dev_alloc() obtained from kzalloc().
 */
static void pmu_dev_release(struct device *dev)
{
	kfree(dev);
}
7198
7199 static int pmu_dev_alloc(struct pmu *pmu)
7200 {
7201         int ret = -ENOMEM;
7202
7203         pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
7204         if (!pmu->dev)
7205                 goto out;
7206
7207         pmu->dev->groups = pmu->attr_groups;
7208         device_initialize(pmu->dev);
7209         ret = dev_set_name(pmu->dev, "%s", pmu->name);
7210         if (ret)
7211                 goto free_dev;
7212
7213         dev_set_drvdata(pmu->dev, pmu);
7214         pmu->dev->bus = &pmu_bus;
7215         pmu->dev->release = pmu_dev_release;
7216         ret = device_add(pmu->dev);
7217         if (ret)
7218                 goto free_dev;
7219
7220 out:
7221         return ret;
7222
7223 free_dev:
7224         put_device(pmu->dev);
7225         goto out;
7226 }
7227
/*
 * Lockdep classes that perf_pmu_register() assigns to the mutex and the
 * lock of every per-cpu context (cpuctx->ctx.mutex and cpuctx->ctx.lock).
 */
7228 static struct lock_class_key cpuctx_mutex;
7229 static struct lock_class_key cpuctx_lock;
7230
7231 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
7232 {
7233         int cpu, ret;
7234
7235         mutex_lock(&pmus_lock);
7236         ret = -ENOMEM;
7237         pmu->pmu_disable_count = alloc_percpu(int);
7238         if (!pmu->pmu_disable_count)
7239                 goto unlock;
7240
7241         pmu->type = -1;
7242         if (!name)
7243                 goto skip_type;
7244         pmu->name = name;
7245
7246         if (type < 0) {
7247                 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
7248                 if (type < 0) {
7249                         ret = type;
7250                         goto free_pdc;
7251                 }
7252         }
7253         pmu->type = type;
7254
7255         if (pmu_bus_running) {
7256                 ret = pmu_dev_alloc(pmu);
7257                 if (ret)
7258                         goto free_idr;
7259         }
7260
7261 skip_type:
7262         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
7263         if (pmu->pmu_cpu_context)
7264                 goto got_cpu_context;
7265
7266         ret = -ENOMEM;
7267         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
7268         if (!pmu->pmu_cpu_context)
7269                 goto free_dev;
7270
7271         for_each_possible_cpu(cpu) {
7272                 struct perf_cpu_context *cpuctx;
7273
7274                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7275                 __perf_event_init_context(&cpuctx->ctx);
7276                 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
7277                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
7278                 cpuctx->ctx.pmu = pmu;
7279
7280                 __perf_cpu_hrtimer_init(cpuctx, cpu);
7281
7282                 cpuctx->unique_pmu = pmu;
7283         }
7284
7285 got_cpu_context:
7286         if (!pmu->start_txn) {
7287                 if (pmu->pmu_enable) {
7288                         /*
7289                          * If we have pmu_enable/pmu_disable calls, install
7290                          * transaction stubs that use that to try and batch
7291                          * hardware accesses.
7292                          */
7293                         pmu->start_txn  = perf_pmu_start_txn;
7294                         pmu->commit_txn = perf_pmu_commit_txn;
7295                         pmu->cancel_txn = perf_pmu_cancel_txn;
7296                 } else {
7297                         pmu->start_txn  = perf_pmu_nop_void;
7298                         pmu->commit_txn = perf_pmu_nop_int;
7299                         pmu->cancel_txn = perf_pmu_nop_void;
7300                 }
7301         }
7302
7303         if (!pmu->pmu_enable) {
7304                 pmu->pmu_enable  = perf_pmu_nop_void;
7305                 pmu->pmu_disable = perf_pmu_nop_void;
7306         }
7307
7308         if (!pmu->event_idx)
7309                 pmu->event_idx = perf_event_idx_default;
7310
7311         list_add_rcu(&pmu->entry, &pmus);
7312         atomic_set(&pmu->exclusive_cnt, 0);
7313         ret = 0;
7314 unlock:
7315         mutex_unlock(&pmus_lock);
7316
7317         return ret;
7318
7319 free_dev:
7320         device_del(pmu->dev);
7321         put_device(pmu->dev);
7322
7323 free_idr:
7324         if (pmu->type >= PERF_TYPE_MAX)
7325                 idr_remove(&pmu_idr, pmu->type);
7326
7327 free_pdc:
7328         free_percpu(pmu->pmu_disable_count);
7329         goto unlock;
7330 }
7331 EXPORT_SYMBOL_GPL(perf_pmu_register);
7332
7333 void perf_pmu_unregister(struct pmu *pmu)
7334 {
7335         mutex_lock(&pmus_lock);
7336         list_del_rcu(&pmu->entry);
7337         mutex_unlock(&pmus_lock);
7338
7339         /*
7340          * We dereference the pmu list under both SRCU and regular RCU, so
7341          * synchronize against both of those.
7342          */
7343         synchronize_srcu(&pmus_srcu);
7344         synchronize_rcu();
7345
7346         free_percpu(pmu->pmu_disable_count);
7347         if (pmu->type >= PERF_TYPE_MAX)
7348                 idr_remove(&pmu_idr, pmu->type);
7349         device_del(pmu->dev);
7350         put_device(pmu->dev);
7351         free_pmu_context(pmu);
7352 }
7353 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
7354
7355 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7356 {
7357         struct perf_event_context *ctx = NULL;
7358         int ret;
7359
7360         if (!try_module_get(pmu->module))
7361                 return -ENODEV;
7362
7363         if (event->group_leader != event) {
7364                 ctx = perf_event_ctx_lock(event->group_leader);
7365                 BUG_ON(!ctx);
7366         }
7367
7368         event->pmu = pmu;
7369         ret = pmu->event_init(event);
7370
7371         if (ctx)
7372                 perf_event_ctx_unlock(event->group_leader, ctx);
7373
7374         if (ret)
7375                 module_put(pmu->module);
7376
7377         return ret;
7378 }
7379
7380 struct pmu *perf_init_event(struct perf_event *event)
7381 {
7382         struct pmu *pmu = NULL;
7383         int idx;
7384         int ret;
7385
7386         idx = srcu_read_lock(&pmus_srcu);
7387
7388         rcu_read_lock();
7389         pmu = idr_find(&pmu_idr, event->attr.type);
7390         rcu_read_unlock();
7391         if (pmu) {
7392                 ret = perf_try_init_event(pmu, event);
7393                 if (ret)
7394                         pmu = ERR_PTR(ret);
7395                 goto unlock;
7396         }
7397
7398         list_for_each_entry_rcu(pmu, &pmus, entry) {
7399                 ret = perf_try_init_event(pmu, event);
7400                 if (!ret)
7401                         goto unlock;
7402
7403                 if (ret != -ENOENT) {
7404                         pmu = ERR_PTR(ret);
7405                         goto unlock;
7406                 }
7407         }
7408         pmu = ERR_PTR(-ENOENT);
7409 unlock:
7410         srcu_read_unlock(&pmus_srcu, idx);
7411
7412         return pmu;
7413 }
7414
7415 static void account_event_cpu(struct perf_event *event, int cpu)
7416 {
7417         if (event->parent)
7418                 return;
7419
7420         if (is_cgroup_event(event))
7421                 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7422 }
7423
7424 static void account_event(struct perf_event *event)
7425 {
7426         if (event->parent)
7427                 return;
7428
7429         if (event->attach_state & PERF_ATTACH_TASK)
7430                 static_key_slow_inc(&perf_sched_events.key);
7431         if (event->attr.mmap || event->attr.mmap_data)
7432                 atomic_inc(&nr_mmap_events);
7433         if (event->attr.comm)
7434                 atomic_inc(&nr_comm_events);
7435         if (event->attr.task)
7436                 atomic_inc(&nr_task_events);
7437         if (event->attr.freq) {
7438                 if (atomic_inc_return(&nr_freq_events) == 1)
7439                         tick_nohz_full_kick_all();
7440         }
7441         if (has_branch_stack(event))
7442                 static_key_slow_inc(&perf_sched_events.key);
7443         if (is_cgroup_event(event))
7444                 static_key_slow_inc(&perf_sched_events.key);
7445
7446         account_event_cpu(event, event->cpu);
7447 }
7448
7449 /*
7450  * Allocate and initialize a event structure
7451  */
7452 static struct perf_event *
7453 perf_event_alloc(struct perf_event_attr *attr, int cpu,
7454                  struct task_struct *task,
7455                  struct perf_event *group_leader,
7456                  struct perf_event *parent_event,
7457                  perf_overflow_handler_t overflow_handler,
7458                  void *context, int cgroup_fd)
7459 {
7460         struct pmu *pmu;
7461         struct perf_event *event;
7462         struct hw_perf_event *hwc;
7463         long err = -EINVAL;
7464
7465         if ((unsigned)cpu >= nr_cpu_ids) {
7466                 if (!task || cpu != -1)
7467                         return ERR_PTR(-EINVAL);
7468         }
7469
7470         event = kzalloc(sizeof(*event), GFP_KERNEL);
7471         if (!event)
7472                 return ERR_PTR(-ENOMEM);
7473
7474         /*
7475          * Single events are their own group leaders, with an
7476          * empty sibling list:
7477          */
7478         if (!group_leader)
7479                 group_leader = event;
7480
7481         mutex_init(&event->child_mutex);
7482         INIT_LIST_HEAD(&event->child_list);
7483
7484         INIT_LIST_HEAD(&event->group_entry);
7485         INIT_LIST_HEAD(&event->event_entry);
7486         INIT_LIST_HEAD(&event->sibling_list);
7487         INIT_LIST_HEAD(&event->rb_entry);
7488         INIT_LIST_HEAD(&event->active_entry);
7489         INIT_HLIST_NODE(&event->hlist_entry);
7490
7491
7492         init_waitqueue_head(&event->waitq);
7493         init_irq_work(&event->pending, perf_pending_event);
7494
7495         mutex_init(&event->mmap_mutex);
7496
7497         atomic_long_set(&event->refcount, 1);
7498         event->cpu              = cpu;
7499         event->attr             = *attr;
7500         event->group_leader     = group_leader;
7501         event->pmu              = NULL;
7502         event->oncpu            = -1;
7503
7504         event->parent           = parent_event;
7505
7506         event->ns               = get_pid_ns(task_active_pid_ns(current));
7507         event->id               = atomic64_inc_return(&perf_event_id);
7508
7509         event->state            = PERF_EVENT_STATE_INACTIVE;
7510
7511         if (task) {
7512                 event->attach_state = PERF_ATTACH_TASK;
7513                 /*
7514                  * XXX pmu::event_init needs to know what task to account to
7515                  * and we cannot use the ctx information because we need the
7516                  * pmu before we get a ctx.
7517                  */
7518                 event->hw.target = task;
7519         }
7520
7521         event->clock = &local_clock;
7522         if (parent_event)
7523                 event->clock = parent_event->clock;
7524
7525         if (!overflow_handler && parent_event) {
7526                 overflow_handler = parent_event->overflow_handler;
7527                 context = parent_event->overflow_handler_context;
7528         }
7529
7530         event->overflow_handler = overflow_handler;
7531         event->overflow_handler_context = context;
7532
7533         perf_event__state_init(event);
7534
7535         pmu = NULL;
7536
7537         hwc = &event->hw;
7538         hwc->sample_period = attr->sample_period;
7539         if (attr->freq && attr->sample_freq)
7540                 hwc->sample_period = 1;
7541         hwc->last_period = hwc->sample_period;
7542
7543         local64_set(&hwc->period_left, hwc->sample_period);
7544
7545         /*
7546          * we currently do not support PERF_FORMAT_GROUP on inherited events
7547          */
7548         if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
7549                 goto err_ns;
7550
7551         if (!has_branch_stack(event))
7552                 event->attr.branch_sample_type = 0;
7553
7554         if (cgroup_fd != -1) {
7555                 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
7556                 if (err)
7557                         goto err_ns;
7558         }
7559
7560         pmu = perf_init_event(event);
7561         if (!pmu)
7562                 goto err_ns;
7563         else if (IS_ERR(pmu)) {
7564                 err = PTR_ERR(pmu);
7565                 goto err_ns;
7566         }
7567
7568         err = exclusive_event_init(event);
7569         if (err)
7570                 goto err_pmu;
7571
7572         if (!event->parent) {
7573                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7574                         err = get_callchain_buffers();
7575                         if (err)
7576                                 goto err_per_task;
7577                 }
7578         }
7579
7580         return event;
7581
7582 err_per_task:
7583         exclusive_event_destroy(event);
7584
7585 err_pmu:
7586         if (event->destroy)
7587                 event->destroy(event);
7588         module_put(pmu->module);
7589 err_ns:
7590         if (is_cgroup_event(event))
7591                 perf_detach_cgroup(event);
7592         if (event->ns)
7593                 put_pid_ns(event->ns);
7594         kfree(event);
7595
7596         return ERR_PTR(err);
7597 }
7598
7599 static int perf_copy_attr(struct perf_event_attr __user *uattr,
7600                           struct perf_event_attr *attr)
7601 {
7602         u32 size;
7603         int ret;
7604
7605         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
7606                 return -EFAULT;
7607
7608         /*
7609          * zero the full structure, so that a short copy will be nice.
7610          */
7611         memset(attr, 0, sizeof(*attr));
7612
7613         ret = get_user(size, &uattr->size);
7614         if (ret)
7615                 return ret;
7616
7617         if (size > PAGE_SIZE)   /* silly large */
7618                 goto err_size;
7619
7620         if (!size)              /* abi compat */
7621                 size = PERF_ATTR_SIZE_VER0;
7622
7623         if (size < PERF_ATTR_SIZE_VER0)
7624                 goto err_size;
7625
7626         /*
7627          * If we're handed a bigger struct than we know of,
7628          * ensure all the unknown bits are 0 - i.e. new
7629          * user-space does not rely on any kernel feature
7630          * extensions we dont know about yet.
7631          */
7632         if (size > sizeof(*attr)) {
7633                 unsigned char __user *addr;
7634                 unsigned char __user *end;
7635                 unsigned char val;
7636
7637                 addr = (void __user *)uattr + sizeof(*attr);
7638                 end  = (void __user *)uattr + size;
7639
7640                 for (; addr < end; addr++) {
7641                         ret = get_user(val, addr);
7642                         if (ret)
7643                                 return ret;
7644                         if (val)
7645                                 goto err_size;
7646                 }
7647                 size = sizeof(*attr);
7648         }
7649
7650         ret = copy_from_user(attr, uattr, size);
7651         if (ret)
7652                 return -EFAULT;
7653
7654         if (attr->__reserved_1)
7655                 return -EINVAL;
7656
7657         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
7658                 return -EINVAL;
7659
7660         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
7661                 return -EINVAL;
7662
7663         if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
7664                 u64 mask = attr->branch_sample_type;
7665
7666                 /* only using defined bits */
7667                 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
7668                         return -EINVAL;
7669
7670                 /* at least one branch bit must be set */
7671                 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
7672                         return -EINVAL;
7673
7674                 /* propagate priv level, when not set for branch */
7675                 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
7676
7677                         /* exclude_kernel checked on syscall entry */
7678                         if (!attr->exclude_kernel)
7679                                 mask |= PERF_SAMPLE_BRANCH_KERNEL;
7680
7681                         if (!attr->exclude_user)
7682                                 mask |= PERF_SAMPLE_BRANCH_USER;
7683
7684                         if (!attr->exclude_hv)
7685                                 mask |= PERF_SAMPLE_BRANCH_HV;
7686                         /*
7687                          * adjust user setting (for HW filter setup)
7688                          */
7689                         attr->branch_sample_type = mask;
7690                 }
7691                 /* privileged levels capture (kernel, hv): check permissions */
7692                 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
7693                     && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7694                         return -EACCES;
7695         }
7696
7697         if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
7698                 ret = perf_reg_validate(attr->sample_regs_user);
7699                 if (ret)
7700                         return ret;
7701         }
7702
7703         if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
7704                 if (!arch_perf_have_user_stack_dump())
7705                         return -ENOSYS;
7706
7707                 /*
7708                  * We have __u32 type for the size, but so far
7709                  * we can only use __u16 as maximum due to the
7710                  * __u16 sample size limit.
7711                  */
7712                 if (attr->sample_stack_user >= USHRT_MAX)
7713                         ret = -EINVAL;
7714                 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
7715                         ret = -EINVAL;
7716         }
7717
7718         if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
7719                 ret = perf_reg_validate(attr->sample_regs_intr);
7720 out:
7721         return ret;
7722
7723 err_size:
7724         put_user(sizeof(*attr), &uattr->size);
7725         ret = -E2BIG;
7726         goto out;
7727 }
7728
7729 static int
7730 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
7731 {
7732         struct ring_buffer *rb = NULL;
7733         int ret = -EINVAL;
7734
7735         if (!output_event)
7736                 goto set;
7737
7738         /* don't allow circular references */
7739         if (event == output_event)
7740                 goto out;
7741
7742         /*
7743          * Don't allow cross-cpu buffers
7744          */
7745         if (output_event->cpu != event->cpu)
7746                 goto out;
7747
7748         /*
7749          * If its not a per-cpu rb, it must be the same task.
7750          */
7751         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
7752                 goto out;
7753
7754         /*
7755          * Mixing clocks in the same buffer is trouble you don't need.
7756          */
7757         if (output_event->clock != event->clock)
7758                 goto out;
7759
7760         /*
7761          * If both events generate aux data, they must be on the same PMU
7762          */
7763         if (has_aux(event) && has_aux(output_event) &&
7764             event->pmu != output_event->pmu)
7765                 goto out;
7766
7767 set:
7768         mutex_lock(&event->mmap_mutex);
7769         /* Can't redirect output if we've got an active mmap() */
7770         if (atomic_read(&event->mmap_count))
7771                 goto unlock;
7772
7773         if (output_event) {
7774                 /* get the rb we want to redirect to */
7775                 rb = ring_buffer_get(output_event);
7776                 if (!rb)
7777                         goto unlock;
7778         }
7779
7780         ring_buffer_attach(event, rb);
7781
7782         ret = 0;
7783 unlock:
7784         mutex_unlock(&event->mmap_mutex);
7785
7786 out:
7787         return ret;
7788 }
7789
7790 static void mutex_lock_double(struct mutex *a, struct mutex *b)
7791 {
7792         if (b < a)
7793                 swap(a, b);
7794
7795         mutex_lock(a);
7796         mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7797 }
7798
7799 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
7800 {
7801         bool nmi_safe = false;
7802
7803         switch (clk_id) {
7804         case CLOCK_MONOTONIC:
7805                 event->clock = &ktime_get_mono_fast_ns;
7806                 nmi_safe = true;
7807                 break;
7808
7809         case CLOCK_MONOTONIC_RAW:
7810                 event->clock = &ktime_get_raw_fast_ns;
7811                 nmi_safe = true;
7812                 break;
7813
7814         case CLOCK_REALTIME:
7815                 event->clock = &ktime_get_real_ns;
7816                 break;
7817
7818         case CLOCK_BOOTTIME:
7819                 event->clock = &ktime_get_boot_ns;
7820                 break;
7821
7822         case CLOCK_TAI:
7823                 event->clock = &ktime_get_tai_ns;
7824                 break;
7825
7826         default:
7827                 return -EINVAL;
7828         }
7829
7830         if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
7831                 return -EINVAL;
7832
7833         return 0;
7834 }
7835
7836 /**
7837  * sys_perf_event_open - open a performance event, associate it to a task/cpu
7838  *
7839  * @attr_uptr:  event_id type attributes for monitoring/sampling
7840  * @pid:                target pid
7841  * @cpu:                target cpu
7842  * @group_fd:           group leader event fd
7843  */
7844 SYSCALL_DEFINE5(perf_event_open,
7845                 struct perf_event_attr __user *, attr_uptr,
7846                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
7847 {
7848         struct perf_event *group_leader = NULL, *output_event = NULL;
7849         struct perf_event *event, *sibling;
7850         struct perf_event_attr attr;
7851         struct perf_event_context *ctx, *uninitialized_var(gctx);
7852         struct file *event_file = NULL;
7853         struct fd group = {NULL, 0};
7854         struct task_struct *task = NULL;
7855         struct pmu *pmu;
7856         int event_fd;
7857         int move_group = 0;
7858         int err;
7859         int f_flags = O_RDWR;
7860         int cgroup_fd = -1;
7861
7862         /* for future expandability... */
7863         if (flags & ~PERF_FLAG_ALL)
7864                 return -EINVAL;
7865
7866         err = perf_copy_attr(attr_uptr, &attr);
7867         if (err)
7868                 return err;
7869
7870         if (!attr.exclude_kernel) {
7871                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7872                         return -EACCES;
7873         }
7874
7875         if (attr.freq) {
7876                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
7877                         return -EINVAL;
7878         } else {
7879                 if (attr.sample_period & (1ULL << 63))
7880                         return -EINVAL;
7881         }
7882
7883         /*
7884          * In cgroup mode, the pid argument is used to pass the fd
7885          * opened to the cgroup directory in cgroupfs. The cpu argument
7886          * designates the cpu on which to monitor threads from that
7887          * cgroup.
7888          */
7889         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
7890                 return -EINVAL;
7891
7892         if (flags & PERF_FLAG_FD_CLOEXEC)
7893                 f_flags |= O_CLOEXEC;
7894
7895         event_fd = get_unused_fd_flags(f_flags);
7896         if (event_fd < 0)
7897                 return event_fd;
7898
7899         if (group_fd != -1) {
7900                 err = perf_fget_light(group_fd, &group);
7901                 if (err)
7902                         goto err_fd;
7903                 group_leader = group.file->private_data;
7904                 if (flags & PERF_FLAG_FD_OUTPUT)
7905                         output_event = group_leader;
7906                 if (flags & PERF_FLAG_FD_NO_GROUP)
7907                         group_leader = NULL;
7908         }
7909
7910         if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
7911                 task = find_lively_task_by_vpid(pid);
7912                 if (IS_ERR(task)) {
7913                         err = PTR_ERR(task);
7914                         goto err_group_fd;
7915                 }
7916         }
7917
7918         if (task && group_leader &&
7919             group_leader->attr.inherit != attr.inherit) {
7920                 err = -EINVAL;
7921                 goto err_task;
7922         }
7923
7924         get_online_cpus();
7925
7926         if (flags & PERF_FLAG_PID_CGROUP)
7927                 cgroup_fd = pid;
7928
7929         event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7930                                  NULL, NULL, cgroup_fd);
7931         if (IS_ERR(event)) {
7932                 err = PTR_ERR(event);
7933                 goto err_cpus;
7934         }
7935
7936         if (is_sampling_event(event)) {
7937                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7938                         err = -ENOTSUPP;
7939                         goto err_alloc;
7940                 }
7941         }
7942
7943         account_event(event);
7944
7945         /*
7946          * Special case software events and allow them to be part of
7947          * any hardware group.
7948          */
7949         pmu = event->pmu;
7950
7951         if (attr.use_clockid) {
7952                 err = perf_event_set_clock(event, attr.clockid);
7953                 if (err)
7954                         goto err_alloc;
7955         }
7956
7957         if (group_leader &&
7958             (is_software_event(event) != is_software_event(group_leader))) {
7959                 if (is_software_event(event)) {
7960                         /*
7961                          * If event and group_leader are not both a software
7962                          * event, and event is, then group leader is not.
7963                          *
7964                          * Allow the addition of software events to !software
7965                          * groups, this is safe because software events never
7966                          * fail to schedule.
7967                          */
7968                         pmu = group_leader->pmu;
7969                 } else if (is_software_event(group_leader) &&
7970                            (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
7971                         /*
7972                          * In case the group is a pure software group, and we
7973                          * try to add a hardware event, move the whole group to
7974                          * the hardware context.
7975                          */
7976                         move_group = 1;
7977                 }
7978         }
7979
7980         /*
7981          * Get the target context (task or percpu):
7982          */
7983         ctx = find_get_context(pmu, task, event);
7984         if (IS_ERR(ctx)) {
7985                 err = PTR_ERR(ctx);
7986                 goto err_alloc;
7987         }
7988
7989         if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
7990                 err = -EBUSY;
7991                 goto err_context;
7992         }
7993
7994         if (task) {
7995                 put_task_struct(task);
7996                 task = NULL;
7997         }
7998
7999         /*
8000          * Look up the group leader (we will attach this event to it):
8001          */
8002         if (group_leader) {
8003                 err = -EINVAL;
8004
8005                 /*
8006                  * Do not allow a recursive hierarchy (this new sibling
8007                  * becoming part of another group-sibling):
8008                  */
8009                 if (group_leader->group_leader != group_leader)
8010                         goto err_context;
8011
8012                 /* All events in a group should have the same clock */
8013                 if (group_leader->clock != event->clock)
8014                         goto err_context;
8015
8016                 /*
8017                  * Do not allow to attach to a group in a different
8018                  * task or CPU context:
8019                  */
8020                 if (move_group) {
8021                         /*
8022                          * Make sure we're both on the same task, or both
8023                          * per-cpu events.
8024                          */
8025                         if (group_leader->ctx->task != ctx->task)
8026                                 goto err_context;
8027
8028                         /*
8029                          * Make sure we're both events for the same CPU;
8030                          * grouping events for different CPUs is broken; since
8031                          * you can never concurrently schedule them anyhow.
8032                          */
8033                         if (group_leader->cpu != event->cpu)
8034                                 goto err_context;
8035                 } else {
8036                         if (group_leader->ctx != ctx)
8037                                 goto err_context;
8038                 }
8039
8040                 /*
8041                  * Only a group leader can be exclusive or pinned
8042                  */
8043                 if (attr.exclusive || attr.pinned)
8044                         goto err_context;
8045         }
8046
8047         if (output_event) {
8048                 err = perf_event_set_output(event, output_event);
8049                 if (err)
8050                         goto err_context;
8051         }
8052
8053         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
8054                                         f_flags);
8055         if (IS_ERR(event_file)) {
8056                 err = PTR_ERR(event_file);
8057                 goto err_context;
8058         }
8059
8060         if (move_group) {
8061                 gctx = group_leader->ctx;
8062
8063                 /*
8064                  * See perf_event_ctx_lock() for comments on the details
8065                  * of swizzling perf_event::ctx.
8066                  */
8067                 mutex_lock_double(&gctx->mutex, &ctx->mutex);
8068
8069                 perf_remove_from_context(group_leader, false);
8070
8071                 list_for_each_entry(sibling, &group_leader->sibling_list,
8072                                     group_entry) {
8073                         perf_remove_from_context(sibling, false);
8074                         put_ctx(gctx);
8075                 }
8076         } else {
8077                 mutex_lock(&ctx->mutex);
8078         }
8079
8080         WARN_ON_ONCE(ctx->parent_ctx);
8081
8082         if (move_group) {
8083                 /*
8084                  * Wait for everybody to stop referencing the events through
8085                  * the old lists, before installing it on new lists.
8086                  */
8087                 synchronize_rcu();
8088
8089                 /*
8090                  * Install the group siblings before the group leader.
8091                  *
8092                  * Because a group leader will try and install the entire group
8093                  * (through the sibling list, which is still in-tact), we can
8094                  * end up with siblings installed in the wrong context.
8095                  *
8096                  * By installing siblings first we NO-OP because they're not
8097                  * reachable through the group lists.
8098                  */
8099                 list_for_each_entry(sibling, &group_leader->sibling_list,
8100                                     group_entry) {
8101                         perf_event__state_init(sibling);
8102                         perf_install_in_context(ctx, sibling, sibling->cpu);
8103                         get_ctx(ctx);
8104                 }
8105
8106                 /*
8107                  * Removing from the context ends up with disabled
8108                  * event. What we want here is event in the initial
8109                  * startup state, ready to be add into new context.
8110                  */
8111                 perf_event__state_init(group_leader);
8112                 perf_install_in_context(ctx, group_leader, group_leader->cpu);
8113                 get_ctx(ctx);
8114         }
8115
8116         if (!exclusive_event_installable(event, ctx)) {
8117                 err = -EBUSY;
8118                 mutex_unlock(&ctx->mutex);
8119                 fput(event_file);
8120                 goto err_context;
8121         }
8122
8123         perf_install_in_context(ctx, event, event->cpu);
8124         perf_unpin_context(ctx);
8125
8126         if (move_group) {
8127                 mutex_unlock(&gctx->mutex);
8128                 put_ctx(gctx);
8129         }
8130         mutex_unlock(&ctx->mutex);
8131
8132         put_online_cpus();
8133
8134         event->owner = current;
8135
8136         mutex_lock(&current->perf_event_mutex);
8137         list_add_tail(&event->owner_entry, &current->perf_event_list);
8138         mutex_unlock(&current->perf_event_mutex);
8139
8140         /*
8141          * Precalculate sample_data sizes
8142          */
8143         perf_event__header_size(event);
8144         perf_event__id_header_size(event);
8145
8146         /*
8147          * Drop the reference on the group_event after placing the
8148          * new event on the sibling_list. This ensures destruction
8149          * of the group leader will find the pointer to itself in
8150          * perf_group_detach().
8151          */
8152         fdput(group);
8153         fd_install(event_fd, event_file);
8154         return event_fd;
8155
8156 err_context:
8157         perf_unpin_context(ctx);
8158         put_ctx(ctx);
8159 err_alloc:
8160         free_event(event);
8161 err_cpus:
8162         put_online_cpus();
8163 err_task:
8164         if (task)
8165                 put_task_struct(task);
8166 err_group_fd:
8167         fdput(group);
8168 err_fd:
8169         put_unused_fd(event_fd);
8170         return err;
8171 }
8172
8173 /**
8174  * perf_event_create_kernel_counter
8175  *
8176  * @attr: attributes of the counter to create
8177  * @cpu: cpu in which the counter is bound
8178  * @task: task to profile (NULL for percpu)
8179  */
8180 struct perf_event *
8181 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
8182                                  struct task_struct *task,
8183                                  perf_overflow_handler_t overflow_handler,
8184                                  void *context)
8185 {
8186         struct perf_event_context *ctx;
8187         struct perf_event *event;
8188         int err;
8189
8190         /*
8191          * Get the target context (task or percpu):
8192          */
8193
8194         event = perf_event_alloc(attr, cpu, task, NULL, NULL,
8195                                  overflow_handler, context, -1);
8196         if (IS_ERR(event)) {
8197                 err = PTR_ERR(event);
8198                 goto err;
8199         }
8200
8201         /* Mark owner so we could distinguish it from user events. */
8202         event->owner = EVENT_OWNER_KERNEL;
8203
8204         account_event(event);
8205
8206         ctx = find_get_context(event->pmu, task, event);
8207         if (IS_ERR(ctx)) {
8208                 err = PTR_ERR(ctx);
8209                 goto err_free;
8210         }
8211
8212         WARN_ON_ONCE(ctx->parent_ctx);
8213         mutex_lock(&ctx->mutex);
8214         if (!exclusive_event_installable(event, ctx)) {
8215                 mutex_unlock(&ctx->mutex);
8216                 perf_unpin_context(ctx);
8217                 put_ctx(ctx);
8218                 err = -EBUSY;
8219                 goto err_free;
8220         }
8221
8222         perf_install_in_context(ctx, event, cpu);
8223         perf_unpin_context(ctx);
8224         mutex_unlock(&ctx->mutex);
8225
8226         return event;
8227
8228 err_free:
8229         free_event(event);
8230 err:
8231         return ERR_PTR(err);
8232 }
8233 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
8234
8235 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
8236 {
8237         struct perf_event_context *src_ctx;
8238         struct perf_event_context *dst_ctx;
8239         struct perf_event *event, *tmp;
8240         LIST_HEAD(events);
8241
8242         src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
8243         dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
8244
8245         /*
8246          * See perf_event_ctx_lock() for comments on the details
8247          * of swizzling perf_event::ctx.
8248          */
8249         mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
8250         list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
8251                                  event_entry) {
8252                 perf_remove_from_context(event, false);
8253                 unaccount_event_cpu(event, src_cpu);
8254                 put_ctx(src_ctx);
8255                 list_add(&event->migrate_entry, &events);
8256         }
8257
8258         /*
8259          * Wait for the events to quiesce before re-instating them.
8260          */
8261         synchronize_rcu();
8262
8263         /*
8264          * Re-instate events in 2 passes.
8265          *
8266          * Skip over group leaders and only install siblings on this first
8267          * pass, siblings will not get enabled without a leader, however a
8268          * leader will enable its siblings, even if those are still on the old
8269          * context.
8270          */
8271         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8272                 if (event->group_leader == event)
8273                         continue;
8274
8275                 list_del(&event->migrate_entry);
8276                 if (event->state >= PERF_EVENT_STATE_OFF)
8277                         event->state = PERF_EVENT_STATE_INACTIVE;
8278                 account_event_cpu(event, dst_cpu);
8279                 perf_install_in_context(dst_ctx, event, dst_cpu);
8280                 get_ctx(dst_ctx);
8281         }
8282
8283         /*
8284          * Once all the siblings are setup properly, install the group leaders
8285          * to make it go.
8286          */
8287         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8288                 list_del(&event->migrate_entry);
8289                 if (event->state >= PERF_EVENT_STATE_OFF)
8290                         event->state = PERF_EVENT_STATE_INACTIVE;
8291                 account_event_cpu(event, dst_cpu);
8292                 perf_install_in_context(dst_ctx, event, dst_cpu);
8293                 get_ctx(dst_ctx);
8294         }
8295         mutex_unlock(&dst_ctx->mutex);
8296         mutex_unlock(&src_ctx->mutex);
8297 }
8298 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
8299
8300 static void sync_child_event(struct perf_event *child_event,
8301                                struct task_struct *child)
8302 {
8303         struct perf_event *parent_event = child_event->parent;
8304         u64 child_val;
8305
8306         if (child_event->attr.inherit_stat)
8307                 perf_event_read_event(child_event, child);
8308
8309         child_val = perf_event_count(child_event);
8310
8311         /*
8312          * Add back the child's count to the parent's count:
8313          */
8314         atomic64_add(child_val, &parent_event->child_count);
8315         atomic64_add(child_event->total_time_enabled,
8316                      &parent_event->child_total_time_enabled);
8317         atomic64_add(child_event->total_time_running,
8318                      &parent_event->child_total_time_running);
8319
8320         /*
8321          * Remove this event from the parent's list
8322          */
8323         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8324         mutex_lock(&parent_event->child_mutex);
8325         list_del_init(&child_event->child_list);
8326         mutex_unlock(&parent_event->child_mutex);
8327
8328         /*
8329          * Make sure user/parent get notified, that we just
8330          * lost one event.
8331          */
8332         perf_event_wakeup(parent_event);
8333
8334         /*
8335          * Release the parent event, if this was the last
8336          * reference to it.
8337          */
8338         put_event(parent_event);
8339 }
8340
8341 static void
8342 __perf_event_exit_task(struct perf_event *child_event,
8343                          struct perf_event_context *child_ctx,
8344                          struct task_struct *child)
8345 {
8346         /*
8347          * Do not destroy the 'original' grouping; because of the context
8348          * switch optimization the original events could've ended up in a
8349          * random child task.
8350          *
8351          * If we were to destroy the original group, all group related
8352          * operations would cease to function properly after this random
8353          * child dies.
8354          *
8355          * Do destroy all inherited groups, we don't care about those
8356          * and being thorough is better.
8357          */
8358         perf_remove_from_context(child_event, !!child_event->parent);
8359
8360         /*
8361          * It can happen that the parent exits first, and has events
8362          * that are still around due to the child reference. These
8363          * events need to be zapped.
8364          */
8365         if (child_event->parent) {
8366                 sync_child_event(child_event, child);
8367                 free_event(child_event);
8368         } else {
8369                 child_event->state = PERF_EVENT_STATE_EXIT;
8370                 perf_event_wakeup(child_event);
8371         }
8372 }
8373
8374 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
8375 {
8376         struct perf_event *child_event, *next;
8377         struct perf_event_context *child_ctx, *clone_ctx = NULL;
8378         unsigned long flags;
8379
8380         if (likely(!child->perf_event_ctxp[ctxn])) {
8381                 perf_event_task(child, NULL, 0);
8382                 return;
8383         }
8384
8385         local_irq_save(flags);
8386         /*
8387          * We can't reschedule here because interrupts are disabled,
8388          * and either child is current or it is a task that can't be
8389          * scheduled, so we are now safe from rescheduling changing
8390          * our context.
8391          */
8392         child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
8393
8394         /*
8395          * Take the context lock here so that if find_get_context is
8396          * reading child->perf_event_ctxp, we wait until it has
8397          * incremented the context's refcount before we do put_ctx below.
8398          */
8399         raw_spin_lock(&child_ctx->lock);
8400         task_ctx_sched_out(child_ctx);
8401         child->perf_event_ctxp[ctxn] = NULL;
8402
8403         /*
8404          * If this context is a clone; unclone it so it can't get
8405          * swapped to another process while we're removing all
8406          * the events from it.
8407          */
8408         clone_ctx = unclone_ctx(child_ctx);
8409         update_context_time(child_ctx);
8410         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
8411
8412         if (clone_ctx)
8413                 put_ctx(clone_ctx);
8414
8415         /*
8416          * Report the task dead after unscheduling the events so that we
8417          * won't get any samples after PERF_RECORD_EXIT. We can however still
8418          * get a few PERF_RECORD_READ events.
8419          */
8420         perf_event_task(child, child_ctx, 0);
8421
8422         /*
8423          * We can recurse on the same lock type through:
8424          *
8425          *   __perf_event_exit_task()
8426          *     sync_child_event()
8427          *       put_event()
8428          *         mutex_lock(&ctx->mutex)
8429          *
8430          * But since its the parent context it won't be the same instance.
8431          */
8432         mutex_lock(&child_ctx->mutex);
8433
8434         list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
8435                 __perf_event_exit_task(child_event, child_ctx, child);
8436
8437         mutex_unlock(&child_ctx->mutex);
8438
8439         put_ctx(child_ctx);
8440 }
8441
8442 /*
8443  * When a child task exits, feed back event values to parent events.
8444  */
8445 void perf_event_exit_task(struct task_struct *child)
8446 {
8447         struct perf_event *event, *tmp;
8448         int ctxn;
8449
8450         mutex_lock(&child->perf_event_mutex);
8451         list_for_each_entry_safe(event, tmp, &child->perf_event_list,
8452                                  owner_entry) {
8453                 list_del_init(&event->owner_entry);
8454
8455                 /*
8456                  * Ensure the list deletion is visible before we clear
8457                  * the owner, closes a race against perf_release() where
8458                  * we need to serialize on the owner->perf_event_mutex.
8459                  */
8460                 smp_wmb();
8461                 event->owner = NULL;
8462         }
8463         mutex_unlock(&child->perf_event_mutex);
8464
8465         for_each_task_context_nr(ctxn)
8466                 perf_event_exit_task_context(child, ctxn);
8467 }
8468
8469 static void perf_free_event(struct perf_event *event,
8470                             struct perf_event_context *ctx)
8471 {
8472         struct perf_event *parent = event->parent;
8473
8474         if (WARN_ON_ONCE(!parent))
8475                 return;
8476
8477         mutex_lock(&parent->child_mutex);
8478         list_del_init(&event->child_list);
8479         mutex_unlock(&parent->child_mutex);
8480
8481         put_event(parent);
8482
8483         raw_spin_lock_irq(&ctx->lock);
8484         perf_group_detach(event);
8485         list_del_event(event, ctx);
8486         raw_spin_unlock_irq(&ctx->lock);
8487         free_event(event);
8488 }
8489
8490 /*
8491  * Free an unexposed, unused context as created by inheritance by
8492  * perf_event_init_task below, used by fork() in case of fail.
8493  *
8494  * Not all locks are strictly required, but take them anyway to be nice and
8495  * help out with the lockdep assertions.
8496  */
8497 void perf_event_free_task(struct task_struct *task)
8498 {
8499         struct perf_event_context *ctx;
8500         struct perf_event *event, *tmp;
8501         int ctxn;
8502
8503         for_each_task_context_nr(ctxn) {
8504                 ctx = task->perf_event_ctxp[ctxn];
8505                 if (!ctx)
8506                         continue;
8507
8508                 mutex_lock(&ctx->mutex);
8509 again:
8510                 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
8511                                 group_entry)
8512                         perf_free_event(event, ctx);
8513
8514                 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
8515                                 group_entry)
8516                         perf_free_event(event, ctx);
8517
8518                 if (!list_empty(&ctx->pinned_groups) ||
8519                                 !list_empty(&ctx->flexible_groups))
8520                         goto again;
8521
8522                 mutex_unlock(&ctx->mutex);
8523
8524                 put_ctx(ctx);
8525         }
8526 }
8527
8528 void perf_event_delayed_put(struct task_struct *task)
8529 {
8530         int ctxn;
8531
8532         for_each_task_context_nr(ctxn)
8533                 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8534 }
8535
8536 /*
8537  * inherit a event from parent task to child task:
8538  */
8539 static struct perf_event *
8540 inherit_event(struct perf_event *parent_event,
8541               struct task_struct *parent,
8542               struct perf_event_context *parent_ctx,
8543               struct task_struct *child,
8544               struct perf_event *group_leader,
8545               struct perf_event_context *child_ctx)
8546 {
8547         enum perf_event_active_state parent_state = parent_event->state;
8548         struct perf_event *child_event;
8549         unsigned long flags;
8550
8551         /*
8552          * Instead of creating recursive hierarchies of events,
8553          * we link inherited events back to the original parent,
8554          * which has a filp for sure, which we use as the reference
8555          * count:
8556          */
8557         if (parent_event->parent)
8558                 parent_event = parent_event->parent;
8559
8560         child_event = perf_event_alloc(&parent_event->attr,
8561                                            parent_event->cpu,
8562                                            child,
8563                                            group_leader, parent_event,
8564                                            NULL, NULL, -1);
8565         if (IS_ERR(child_event))
8566                 return child_event;
8567
8568         if (is_orphaned_event(parent_event) ||
8569             !atomic_long_inc_not_zero(&parent_event->refcount)) {
8570                 free_event(child_event);
8571                 return NULL;
8572         }
8573
8574         get_ctx(child_ctx);
8575
8576         /*
8577          * Make the child state follow the state of the parent event,
8578          * not its attr.disabled bit.  We hold the parent's mutex,
8579          * so we won't race with perf_event_{en, dis}able_family.
8580          */
8581         if (parent_state >= PERF_EVENT_STATE_INACTIVE)
8582                 child_event->state = PERF_EVENT_STATE_INACTIVE;
8583         else
8584                 child_event->state = PERF_EVENT_STATE_OFF;
8585
8586         if (parent_event->attr.freq) {
8587                 u64 sample_period = parent_event->hw.sample_period;
8588                 struct hw_perf_event *hwc = &child_event->hw;
8589
8590                 hwc->sample_period = sample_period;
8591                 hwc->last_period   = sample_period;
8592
8593                 local64_set(&hwc->period_left, sample_period);
8594         }
8595
8596         child_event->ctx = child_ctx;
8597         child_event->overflow_handler = parent_event->overflow_handler;
8598         child_event->overflow_handler_context
8599                 = parent_event->overflow_handler_context;
8600
8601         /*
8602          * Precalculate sample_data sizes
8603          */
8604         perf_event__header_size(child_event);
8605         perf_event__id_header_size(child_event);
8606
8607         /*
8608          * Link it up in the child's context:
8609          */
8610         raw_spin_lock_irqsave(&child_ctx->lock, flags);
8611         add_event_to_ctx(child_event, child_ctx);
8612         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
8613
8614         /*
8615          * Link this into the parent event's child list
8616          */
8617         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8618         mutex_lock(&parent_event->child_mutex);
8619         list_add_tail(&child_event->child_list, &parent_event->child_list);
8620         mutex_unlock(&parent_event->child_mutex);
8621
8622         return child_event;
8623 }
8624
8625 static int inherit_group(struct perf_event *parent_event,
8626               struct task_struct *parent,
8627               struct perf_event_context *parent_ctx,
8628               struct task_struct *child,
8629               struct perf_event_context *child_ctx)
8630 {
8631         struct perf_event *leader;
8632         struct perf_event *sub;
8633         struct perf_event *child_ctr;
8634
8635         leader = inherit_event(parent_event, parent, parent_ctx,
8636                                  child, NULL, child_ctx);
8637         if (IS_ERR(leader))
8638                 return PTR_ERR(leader);
8639         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
8640                 child_ctr = inherit_event(sub, parent, parent_ctx,
8641                                             child, leader, child_ctx);
8642                 if (IS_ERR(child_ctr))
8643                         return PTR_ERR(child_ctr);
8644         }
8645         return 0;
8646 }
8647
8648 static int
8649 inherit_task_group(struct perf_event *event, struct task_struct *parent,
8650                    struct perf_event_context *parent_ctx,
8651                    struct task_struct *child, int ctxn,
8652                    int *inherited_all)
8653 {
8654         int ret;
8655         struct perf_event_context *child_ctx;
8656
8657         if (!event->attr.inherit) {
8658                 *inherited_all = 0;
8659                 return 0;
8660         }
8661
8662         child_ctx = child->perf_event_ctxp[ctxn];
8663         if (!child_ctx) {
8664                 /*
8665                  * This is executed from the parent task context, so
8666                  * inherit events that have been marked for cloning.
8667                  * First allocate and initialize a context for the
8668                  * child.
8669                  */
8670
8671                 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
8672                 if (!child_ctx)
8673                         return -ENOMEM;
8674
8675                 child->perf_event_ctxp[ctxn] = child_ctx;
8676         }
8677
8678         ret = inherit_group(event, parent, parent_ctx,
8679                             child, child_ctx);
8680
8681         if (ret)
8682                 *inherited_all = 0;
8683
8684         return ret;
8685 }
8686
8687 /*
8688  * Initialize the perf_event context in task_struct
8689  */
8690 static int perf_event_init_context(struct task_struct *child, int ctxn)
8691 {
8692         struct perf_event_context *child_ctx, *parent_ctx;
8693         struct perf_event_context *cloned_ctx;
8694         struct perf_event *event;
8695         struct task_struct *parent = current;
8696         int inherited_all = 1;
8697         unsigned long flags;
8698         int ret = 0;
8699
8700         if (likely(!parent->perf_event_ctxp[ctxn]))
8701                 return 0;
8702
8703         /*
8704          * If the parent's context is a clone, pin it so it won't get
8705          * swapped under us.
8706          */
8707         parent_ctx = perf_pin_task_context(parent, ctxn);
8708         if (!parent_ctx)
8709                 return 0;
8710
8711         /*
8712          * No need to check if parent_ctx != NULL here; since we saw
8713          * it non-NULL earlier, the only reason for it to become NULL
8714          * is if we exit, and since we're currently in the middle of
8715          * a fork we can't be exiting at the same time.
8716          */
8717
8718         /*
8719          * Lock the parent list. No need to lock the child - not PID
8720          * hashed yet and not running, so nobody can access it.
8721          */
8722         mutex_lock(&parent_ctx->mutex);
8723
8724         /*
8725          * We dont have to disable NMIs - we are only looking at
8726          * the list, not manipulating it:
8727          */
8728         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
8729                 ret = inherit_task_group(event, parent, parent_ctx,
8730                                          child, ctxn, &inherited_all);
8731                 if (ret)
8732                         break;
8733         }
8734
8735         /*
8736          * We can't hold ctx->lock when iterating the ->flexible_group list due
8737          * to allocations, but we need to prevent rotation because
8738          * rotate_ctx() will change the list from interrupt context.
8739          */
8740         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8741         parent_ctx->rotate_disable = 1;
8742         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
8743
8744         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
8745                 ret = inherit_task_group(event, parent, parent_ctx,
8746                                          child, ctxn, &inherited_all);
8747                 if (ret)
8748                         break;
8749         }
8750
8751         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8752         parent_ctx->rotate_disable = 0;
8753
8754         child_ctx = child->perf_event_ctxp[ctxn];
8755
8756         if (child_ctx && inherited_all) {
8757                 /*
8758                  * Mark the child context as a clone of the parent
8759                  * context, or of whatever the parent is a clone of.
8760                  *
8761                  * Note that if the parent is a clone, the holding of
8762                  * parent_ctx->lock avoids it from being uncloned.
8763                  */
8764                 cloned_ctx = parent_ctx->parent_ctx;
8765                 if (cloned_ctx) {
8766                         child_ctx->parent_ctx = cloned_ctx;
8767                         child_ctx->parent_gen = parent_ctx->parent_gen;
8768                 } else {
8769                         child_ctx->parent_ctx = parent_ctx;
8770                         child_ctx->parent_gen = parent_ctx->generation;
8771                 }
8772                 get_ctx(child_ctx->parent_ctx);
8773         }
8774
8775         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
8776         mutex_unlock(&parent_ctx->mutex);
8777
8778         perf_unpin_context(parent_ctx);
8779         put_ctx(parent_ctx);
8780
8781         return ret;
8782 }
8783
8784 /*
8785  * Initialize the perf_event context in task_struct
8786  */
8787 int perf_event_init_task(struct task_struct *child)
8788 {
8789         int ctxn, ret;
8790
8791         memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
8792         mutex_init(&child->perf_event_mutex);
8793         INIT_LIST_HEAD(&child->perf_event_list);
8794
8795         for_each_task_context_nr(ctxn) {
8796                 ret = perf_event_init_context(child, ctxn);
8797                 if (ret) {
8798                         perf_event_free_task(child);
8799                         return ret;
8800                 }
8801         }
8802
8803         return 0;
8804 }
8805
8806 static void __init perf_event_init_all_cpus(void)
8807 {
8808         struct swevent_htable *swhash;
8809         int cpu;
8810
8811         for_each_possible_cpu(cpu) {
8812                 swhash = &per_cpu(swevent_htable, cpu);
8813                 mutex_init(&swhash->hlist_mutex);
8814                 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
8815         }
8816 }
8817
8818 static void perf_event_init_cpu(int cpu)
8819 {
8820         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8821
8822         mutex_lock(&swhash->hlist_mutex);
8823         swhash->online = true;
8824         if (swhash->hlist_refcount > 0) {
8825                 struct swevent_hlist *hlist;
8826
8827                 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
8828                 WARN_ON(!hlist);
8829                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
8830         }
8831         mutex_unlock(&swhash->hlist_mutex);
8832 }
8833
8834 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
8835 static void __perf_event_exit_context(void *__info)
8836 {
8837         struct remove_event re = { .detach_group = true };
8838         struct perf_event_context *ctx = __info;
8839
8840         rcu_read_lock();
8841         list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
8842                 __perf_remove_from_context(&re);
8843         rcu_read_unlock();
8844 }
8845
8846 static void perf_event_exit_cpu_context(int cpu)
8847 {
8848         struct perf_event_context *ctx;
8849         struct pmu *pmu;
8850         int idx;
8851
8852         idx = srcu_read_lock(&pmus_srcu);
8853         list_for_each_entry_rcu(pmu, &pmus, entry) {
8854                 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
8855
8856                 mutex_lock(&ctx->mutex);
8857                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
8858                 mutex_unlock(&ctx->mutex);
8859         }
8860         srcu_read_unlock(&pmus_srcu, idx);
8861 }
8862
8863 static void perf_event_exit_cpu(int cpu)
8864 {
8865         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8866
8867         perf_event_exit_cpu_context(cpu);
8868
8869         mutex_lock(&swhash->hlist_mutex);
8870         swhash->online = false;
8871         swevent_hlist_release(swhash);
8872         mutex_unlock(&swhash->hlist_mutex);
8873 }
8874 #else
/* Empty stand-in used when neither CONFIG_HOTPLUG_CPU nor CONFIG_KEXEC is set. */
static inline void perf_event_exit_cpu(int cpu)
{
}
8876 #endif
8877
8878 static int
8879 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
8880 {
8881         int cpu;
8882
8883         for_each_online_cpu(cpu)
8884                 perf_event_exit_cpu(cpu);
8885
8886         return NOTIFY_OK;
8887 }
8888
8889 /*
8890  * Run the perf reboot notifier at the very last possible moment so that
8891  * the generic watchdog code runs as long as possible.
8892  */
8893 static struct notifier_block perf_reboot_notifier = {
8894         .notifier_call = perf_reboot,
8895         .priority = INT_MIN,
8896 };
8897
8898 static int
8899 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
8900 {
8901         unsigned int cpu = (long)hcpu;
8902
8903         switch (action & ~CPU_TASKS_FROZEN) {
8904
8905         case CPU_UP_PREPARE:
8906         case CPU_DOWN_FAILED:
8907                 perf_event_init_cpu(cpu);
8908                 break;
8909
8910         case CPU_UP_CANCELED:
8911         case CPU_DOWN_PREPARE:
8912                 perf_event_exit_cpu(cpu);
8913                 break;
8914         default:
8915                 break;
8916         }
8917
8918         return NOTIFY_OK;
8919 }
8920
8921 void __init perf_event_init(void)
8922 {
8923         int ret;
8924
8925         idr_init(&pmu_idr);
8926
8927         perf_event_init_all_cpus();
8928         init_srcu_struct(&pmus_srcu);
8929         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
8930         perf_pmu_register(&perf_cpu_clock, NULL, -1);
8931         perf_pmu_register(&perf_task_clock, NULL, -1);
8932         perf_tp_register();
8933         perf_cpu_notifier(perf_cpu_notify);
8934         register_reboot_notifier(&perf_reboot_notifier);
8935
8936         ret = init_hw_breakpoint();
8937         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
8938
8939         /* do not patch jump label more than once per second */
8940         jump_label_rate_limit(&perf_sched_events, HZ);
8941
8942         /*
8943          * Build time assertion that we keep the data_head at the intended
8944          * location.  IOW, validation we got the __reserved[] size right.
8945          */
8946         BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
8947                      != 1024);
8948 }
8949
8950 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
8951                               char *page)
8952 {
8953         struct perf_pmu_events_attr *pmu_attr =
8954                 container_of(attr, struct perf_pmu_events_attr, attr);
8955
8956         if (pmu_attr->event_str)
8957                 return sprintf(page, "%s\n", pmu_attr->event_str);
8958
8959         return 0;
8960 }
8961
8962 static int __init perf_event_sysfs_init(void)
8963 {
8964         struct pmu *pmu;
8965         int ret;
8966
8967         mutex_lock(&pmus_lock);
8968
8969         ret = bus_register(&pmu_bus);
8970         if (ret)
8971                 goto unlock;
8972
8973         list_for_each_entry(pmu, &pmus, entry) {
8974                 if (!pmu->name || pmu->type < 0)
8975                         continue;
8976
8977                 ret = pmu_dev_alloc(pmu);
8978                 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
8979         }
8980         pmu_bus_running = 1;
8981         ret = 0;
8982
8983 unlock:
8984         mutex_unlock(&pmus_lock);
8985
8986         return ret;
8987 }
8988 device_initcall(perf_event_sysfs_init);
8989
8990 #ifdef CONFIG_CGROUP_PERF
8991 static struct cgroup_subsys_state *
8992 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8993 {
8994         struct perf_cgroup *jc;
8995
8996         jc = kzalloc(sizeof(*jc), GFP_KERNEL);
8997         if (!jc)
8998                 return ERR_PTR(-ENOMEM);
8999
9000         jc->info = alloc_percpu(struct perf_cgroup_info);
9001         if (!jc->info) {
9002                 kfree(jc);
9003                 return ERR_PTR(-ENOMEM);
9004         }
9005
9006         return &jc->css;
9007 }
9008
9009 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
9010 {
9011         struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
9012
9013         free_percpu(jc->info);
9014         kfree(jc);
9015 }
9016
9017 static int __perf_cgroup_move(void *info)
9018 {
9019         struct task_struct *task = info;
9020         perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
9021         return 0;
9022 }
9023
9024 static void perf_cgroup_attach(struct cgroup_subsys_state *css,
9025                                struct cgroup_taskset *tset)
9026 {
9027         struct task_struct *task;
9028
9029         cgroup_taskset_for_each(task, tset)
9030                 task_function_call(task, __perf_cgroup_move, task);
9031 }
9032
9033 static void perf_cgroup_exit(struct cgroup_subsys_state *css,
9034                              struct cgroup_subsys_state *old_css,
9035                              struct task_struct *task)
9036 {
9037         /*
9038          * cgroup_exit() is called in the copy_process() failure path.
9039          * Ignore this case since the task hasn't ran yet, this avoids
9040          * trying to poke a half freed task state from generic code.
9041          */
9042         if (!(task->flags & PF_EXITING))
9043                 return;
9044
9045         task_function_call(task, __perf_cgroup_move, task);
9046 }
9047
9048 struct cgroup_subsys perf_event_cgrp_subsys = {
9049         .css_alloc      = perf_cgroup_css_alloc,
9050         .css_free       = perf_cgroup_css_free,
9051         .exit           = perf_cgroup_exit,
9052         .attach         = perf_cgroup_attach,
9053 };
9054 #endif /* CONFIG_CGROUP_PERF */