Merge branch 'linus' into perf/urgent, to pick up dependent commits
author    Ingo Molnar <mingo@kernel.org>
          Wed, 29 Nov 2017 06:11:24 +0000 (07:11 +0100)
committer Ingo Molnar <mingo@kernel.org>
          Wed, 29 Nov 2017 06:11:24 +0000 (07:11 +0100)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/events/intel/core.c
kernel/events/core.c

index f94855000d4ed02677ae5a46a3e622bf54a5f0b2,43445da30ceab12323772e81c95f0dfb3ba8cfa3..09c26a4f139c125e000675689ebc983acd8ab91a
@@@ -2958,6 -2958,10 +2958,10 @@@ static unsigned long intel_pmu_free_run
  
        if (event->attr.use_clockid)
                flags &= ~PERF_SAMPLE_TIME;
+       if (!event->attr.exclude_kernel)
+               flags &= ~PERF_SAMPLE_REGS_USER;
+       if (event->attr.sample_regs_user & ~PEBS_REGS)
+               flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR);
        return flags;
  }
  
@@@ -3730,19 -3734,6 +3734,19 @@@ EVENT_ATTR_STR(cycles-t,      cycles_t,       "eve
  EVENT_ATTR_STR(cycles-ct,     cycles_ct,      "event=0x3c,in_tx=1,in_tx_cp=1");
  
  static struct attribute *hsw_events_attrs[] = {
 +      EVENT_PTR(mem_ld_hsw),
 +      EVENT_PTR(mem_st_hsw),
 +      EVENT_PTR(td_slots_issued),
 +      EVENT_PTR(td_slots_retired),
 +      EVENT_PTR(td_fetch_bubbles),
 +      EVENT_PTR(td_total_slots),
 +      EVENT_PTR(td_total_slots_scale),
 +      EVENT_PTR(td_recovery_bubbles),
 +      EVENT_PTR(td_recovery_bubbles_scale),
 +      NULL
 +};
 +
 +static struct attribute *hsw_tsx_events_attrs[] = {
        EVENT_PTR(tx_start),
        EVENT_PTR(tx_commit),
        EVENT_PTR(tx_abort),
        EVENT_PTR(el_conflict),
        EVENT_PTR(cycles_t),
        EVENT_PTR(cycles_ct),
 -      EVENT_PTR(mem_ld_hsw),
 -      EVENT_PTR(mem_st_hsw),
 -      EVENT_PTR(td_slots_issued),
 -      EVENT_PTR(td_slots_retired),
 -      EVENT_PTR(td_fetch_bubbles),
 -      EVENT_PTR(td_total_slots),
 -      EVENT_PTR(td_total_slots_scale),
 -      EVENT_PTR(td_recovery_bubbles),
 -      EVENT_PTR(td_recovery_bubbles_scale),
        NULL
  };
  
 +static __init struct attribute **get_hsw_events_attrs(void)
 +{
 +      return boot_cpu_has(X86_FEATURE_RTM) ?
 +              merge_attr(hsw_events_attrs, hsw_tsx_events_attrs) :
 +              hsw_events_attrs;
 +}
 +
  static ssize_t freeze_on_smi_show(struct device *cdev,
                                  struct device_attribute *attr,
                                  char *buf)
@@@ -4193,7 -4186,7 +4197,7 @@@ __init int intel_pmu_init(void
  
                x86_pmu.hw_config = hsw_hw_config;
                x86_pmu.get_event_constraints = hsw_get_event_constraints;
 -              x86_pmu.cpu_events = hsw_events_attrs;
 +              x86_pmu.cpu_events = get_hsw_events_attrs();
                x86_pmu.lbr_double_abort = true;
                extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
                        hsw_format_attr : nhm_format_attr;
  
                x86_pmu.hw_config = hsw_hw_config;
                x86_pmu.get_event_constraints = hsw_get_event_constraints;
 -              x86_pmu.cpu_events = hsw_events_attrs;
 +              x86_pmu.cpu_events = get_hsw_events_attrs();
                x86_pmu.limit_period = bdw_limit_period;
                extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
                        hsw_format_attr : nhm_format_attr;
                extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
                        hsw_format_attr : nhm_format_attr;
                extra_attr = merge_attr(extra_attr, skl_format_attr);
 -              x86_pmu.cpu_events = hsw_events_attrs;
 +              x86_pmu.cpu_events = get_hsw_events_attrs();
                intel_pmu_pebs_data_source_skl(
                        boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
                pr_cont("Skylake events, ");
diff --combined kernel/events/core.c
index ab5ac84f82e20e60efdc55613680cefc4b562117,4c39c05e029a7f357cb872ffa982d6ca2381fddb..6ceb10d87462ee9521ddf11d4880f09775fa8ff0
@@@ -209,7 -209,7 +209,7 @@@ static int event_function(void *info
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        int ret = 0;
  
-       WARN_ON_ONCE(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        perf_ctx_lock(cpuctx, task_ctx);
        /*
@@@ -306,7 -306,7 +306,7 @@@ static void event_function_local(struc
        struct task_struct *task = READ_ONCE(ctx->task);
        struct perf_event_context *task_ctx = NULL;
  
-       WARN_ON_ONCE(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        if (task) {
                if (task == TASK_TOMBSTONE)
@@@ -582,6 -582,88 +582,88 @@@ static inline u64 perf_event_clock(stru
        return event->clock();
  }
  
+ /*
+  * State based event timekeeping...
+  *
+  * The basic idea is to use event->state to determine which (if any) time
+  * fields to increment with the current delta. This means we only need to
+  * update timestamps when we change state or when they are explicitly requested
+  * (read).
+  *
+  * Event groups make things a little more complicated, but not terribly so. The
+  * rules for a group are that if the group leader is OFF the entire group is
+  * OFF, irrespective of what the group member states are. This results in
+  * __perf_effective_state().
+  *
+  * A further ramification is that when a group leader flips between OFF and
+  * !OFF, we need to update all group member times.
+  *
+  *
+  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
+  * need to make sure the relevant context time is updated before we try and
+  * update our timestamps.
+  */
+ static __always_inline enum perf_event_state
+ __perf_effective_state(struct perf_event *event)
+ {
+       struct perf_event *leader = event->group_leader;
+       if (leader->state <= PERF_EVENT_STATE_OFF)
+               return leader->state;
+       return event->state;
+ }
+ static __always_inline void
+ __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
+ {
+       enum perf_event_state state = __perf_effective_state(event);
+       u64 delta = now - event->tstamp;
+       *enabled = event->total_time_enabled;
+       if (state >= PERF_EVENT_STATE_INACTIVE)
+               *enabled += delta;
+       *running = event->total_time_running;
+       if (state >= PERF_EVENT_STATE_ACTIVE)
+               *running += delta;
+ }
+ static void perf_event_update_time(struct perf_event *event)
+ {
+       u64 now = perf_event_time(event);
+       __perf_update_times(event, now, &event->total_time_enabled,
+                                       &event->total_time_running);
+       event->tstamp = now;
+ }
+ static void perf_event_update_sibling_time(struct perf_event *leader)
+ {
+       struct perf_event *sibling;
+       list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+               perf_event_update_time(sibling);
+ }
+ static void
+ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
+ {
+       if (event->state == state)
+               return;
+       perf_event_update_time(event);
+       /*
+        * If a group leader gets enabled/disabled all its siblings
+        * are affected too.
+        */
+       if ((event->state < 0) ^ (state < 0))
+               perf_event_update_sibling_time(event);
+       WRITE_ONCE(event->state, state);
+ }
  #ifdef CONFIG_CGROUP_PERF
  
  static inline bool
@@@ -841,40 -923,6 +923,6 @@@ perf_cgroup_set_shadow_time(struct perf
        event->shadow_ctx_time = now - t->timestamp;
  }
  
- static inline void
- perf_cgroup_defer_enabled(struct perf_event *event)
- {
-       /*
-        * when the current task's perf cgroup does not match
-        * the event's, we need to remember to call the
-        * perf_mark_enable() function the first time a task with
-        * a matching perf cgroup is scheduled in.
-        */
-       if (is_cgroup_event(event) && !perf_cgroup_match(event))
-               event->cgrp_defer_enabled = 1;
- }
- static inline void
- perf_cgroup_mark_enabled(struct perf_event *event,
-                        struct perf_event_context *ctx)
- {
-       struct perf_event *sub;
-       u64 tstamp = perf_event_time(event);
-       if (!event->cgrp_defer_enabled)
-               return;
-       event->cgrp_defer_enabled = 0;
-       event->tstamp_enabled = tstamp - event->total_time_enabled;
-       list_for_each_entry(sub, &event->sibling_list, group_entry) {
-               if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
-                       sub->tstamp_enabled = tstamp - sub->total_time_enabled;
-                       sub->cgrp_defer_enabled = 0;
-               }
-       }
- }
  /*
   * Update cpuctx->cgrp so that it is set when first cgroup event is added and
   * cleared when last cgroup event is removed.
@@@ -974,17 -1022,6 +1022,6 @@@ static inline u64 perf_cgroup_event_tim
        return 0;
  }
  
- static inline void
- perf_cgroup_defer_enabled(struct perf_event *event)
- {
- }
- static inline void
- perf_cgroup_mark_enabled(struct perf_event *event,
-                        struct perf_event_context *ctx)
- {
- }
  static inline void
  list_update_cgroup_event(struct perf_event *event,
                         struct perf_event_context *ctx, bool add)
@@@ -1006,7 -1043,7 +1043,7 @@@ static enum hrtimer_restart perf_mux_hr
        struct perf_cpu_context *cpuctx;
        int rotations = 0;
  
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
        rotations = perf_rotate_context(cpuctx);
@@@ -1093,7 -1130,7 +1130,7 @@@ static void perf_event_ctx_activate(str
  {
        struct list_head *head = this_cpu_ptr(&active_ctx_list);
  
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        WARN_ON(!list_empty(&ctx->active_ctx_list));
  
  
  static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
  {
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        WARN_ON(list_empty(&ctx->active_ctx_list));
  
@@@ -1202,7 -1239,7 +1239,7 @@@ perf_event_ctx_lock_nested(struct perf_
  
  again:
        rcu_read_lock();
-       ctx = ACCESS_ONCE(event->ctx);
+       ctx = READ_ONCE(event->ctx);
        if (!atomic_inc_not_zero(&ctx->refcount)) {
                rcu_read_unlock();
                goto again;
@@@ -1398,60 -1435,6 +1435,6 @@@ static u64 perf_event_time(struct perf_
        return ctx ? ctx->time : 0;
  }
  
- /*
-  * Update the total_time_enabled and total_time_running fields for a event.
-  */
- static void update_event_times(struct perf_event *event)
- {
-       struct perf_event_context *ctx = event->ctx;
-       u64 run_end;
-       lockdep_assert_held(&ctx->lock);
-       if (event->state < PERF_EVENT_STATE_INACTIVE ||
-           event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
-               return;
-       /*
-        * in cgroup mode, time_enabled represents
-        * the time the event was enabled AND active
-        * tasks were in the monitored cgroup. This is
-        * independent of the activity of the context as
-        * there may be a mix of cgroup and non-cgroup events.
-        *
-        * That is why we treat cgroup events differently
-        * here.
-        */
-       if (is_cgroup_event(event))
-               run_end = perf_cgroup_event_time(event);
-       else if (ctx->is_active)
-               run_end = ctx->time;
-       else
-               run_end = event->tstamp_stopped;
-       event->total_time_enabled = run_end - event->tstamp_enabled;
-       if (event->state == PERF_EVENT_STATE_INACTIVE)
-               run_end = event->tstamp_stopped;
-       else
-               run_end = perf_event_time(event);
-       event->total_time_running = run_end - event->tstamp_running;
- }
- /*
-  * Update total_time_enabled and total_time_running for all events in a group.
-  */
- static void update_group_times(struct perf_event *leader)
- {
-       struct perf_event *event;
-       update_event_times(leader);
-       list_for_each_entry(event, &leader->sibling_list, group_entry)
-               update_event_times(event);
- }
  static enum event_type_t get_event_type(struct perf_event *event)
  {
        struct perf_event_context *ctx = event->ctx;
@@@ -1494,6 -1477,8 +1477,8 @@@ list_add_event(struct perf_event *event
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        event->attach_state |= PERF_ATTACH_CONTEXT;
  
+       event->tstamp = perf_event_time(event);
        /*
         * If we're a stand alone event or group leader, we go to the context
         * list, group events are kept attached to the group so that
@@@ -1701,8 -1686,6 +1686,6 @@@ list_del_event(struct perf_event *event
        if (event->group_leader == event)
                list_del_init(&event->group_entry);
  
-       update_group_times(event);
        /*
         * If event was in error state, then keep it
         * that way, otherwise bogus counts will be
         * of the event
         */
        if (event->state > PERF_EVENT_STATE_OFF)
-               event->state = PERF_EVENT_STATE_OFF;
+               perf_event_set_state(event, PERF_EVENT_STATE_OFF);
  
        ctx->generation++;
  }
@@@ -1810,38 -1793,24 +1793,24 @@@ event_sched_out(struct perf_event *even
                  struct perf_cpu_context *cpuctx,
                  struct perf_event_context *ctx)
  {
-       u64 tstamp = perf_event_time(event);
-       u64 delta;
+       enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
  
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);
  
-       /*
-        * An event which could not be activated because of
-        * filter mismatch still needs to have its timings
-        * maintained, otherwise bogus information is return
-        * via read() for time_enabled, time_running:
-        */
-       if (event->state == PERF_EVENT_STATE_INACTIVE &&
-           !event_filter_match(event)) {
-               delta = tstamp - event->tstamp_stopped;
-               event->tstamp_running += delta;
-               event->tstamp_stopped = tstamp;
-       }
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;
  
        perf_pmu_disable(event->pmu);
  
-       event->tstamp_stopped = tstamp;
        event->pmu->del(event, 0);
        event->oncpu = -1;
-       event->state = PERF_EVENT_STATE_INACTIVE;
        if (event->pending_disable) {
                event->pending_disable = 0;
-               event->state = PERF_EVENT_STATE_OFF;
+               state = PERF_EVENT_STATE_OFF;
        }
+       perf_event_set_state(event, state);
  
        if (!is_software_event(event))
                cpuctx->active_oncpu--;
@@@ -1861,7 -1830,9 +1830,9 @@@ group_sched_out(struct perf_event *grou
                struct perf_event_context *ctx)
  {
        struct perf_event *event;
-       int state = group_event->state;
+       if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+               return;
  
        perf_pmu_disable(ctx->pmu);
  
  
        perf_pmu_enable(ctx->pmu);
  
-       if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
+       if (group_event->attr.exclusive)
                cpuctx->exclusive = 0;
  }
  
@@@ -1895,6 -1866,11 +1866,11 @@@ __perf_remove_from_context(struct perf_
  {
        unsigned long flags = (unsigned long)info;
  
+       if (ctx->is_active & EVENT_TIME) {
+               update_context_time(ctx);
+               update_cgrp_time_from_cpuctx(cpuctx);
+       }
        event_sched_out(event, cpuctx, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
@@@ -1957,14 -1933,17 +1933,17 @@@ static void __perf_event_disable(struc
        if (event->state < PERF_EVENT_STATE_INACTIVE)
                return;
  
-       update_context_time(ctx);
-       update_cgrp_time_from_event(event);
-       update_group_times(event);
+       if (ctx->is_active & EVENT_TIME) {
+               update_context_time(ctx);
+               update_cgrp_time_from_event(event);
+       }
        if (event == event->group_leader)
                group_sched_out(event, cpuctx, ctx);
        else
                event_sched_out(event, cpuctx, ctx);
-       event->state = PERF_EVENT_STATE_OFF;
+       perf_event_set_state(event, PERF_EVENT_STATE_OFF);
  }
  
  /*
@@@ -2021,8 -2000,7 +2000,7 @@@ void perf_event_disable_inatomic(struc
  }
  
  static void perf_set_shadow_time(struct perf_event *event,
-                                struct perf_event_context *ctx,
-                                u64 tstamp)
+                                struct perf_event_context *ctx)
  {
        /*
         * use the correct time source for the time snapshot
         * is cleaner and simpler to understand.
         */
        if (is_cgroup_event(event))
-               perf_cgroup_set_shadow_time(event, tstamp);
+               perf_cgroup_set_shadow_time(event, event->tstamp);
        else
-               event->shadow_ctx_time = tstamp - ctx->timestamp;
+               event->shadow_ctx_time = event->tstamp - ctx->timestamp;
  }
  
  #define MAX_INTERRUPTS (~0ULL)
@@@ -2065,7 -2043,6 +2043,6 @@@ event_sched_in(struct perf_event *event
                 struct perf_cpu_context *cpuctx,
                 struct perf_event_context *ctx)
  {
-       u64 tstamp = perf_event_time(event);
        int ret = 0;
  
        lockdep_assert_held(&ctx->lock);
  
        WRITE_ONCE(event->oncpu, smp_processor_id());
        /*
-        * Order event::oncpu write to happen before the ACTIVE state
-        * is visible.
+        * Order event::oncpu write to happen before the ACTIVE state is
+        * visible. This allows perf_event_{stop,read}() to observe the correct
+        * ->oncpu if it sees ACTIVE.
         */
        smp_wmb();
-       WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
+       perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
  
        /*
         * Unthrottle events, since we scheduled we might have missed several
                event->hw.interrupts = 0;
        }
  
-       /*
-        * The new state must be visible before we turn it on in the hardware:
-        */
-       smp_wmb();
        perf_pmu_disable(event->pmu);
  
-       perf_set_shadow_time(event, ctx, tstamp);
+       perf_set_shadow_time(event, ctx);
  
        perf_log_itrace_start(event);
  
        if (event->pmu->add(event, PERF_EF_START)) {
-               event->state = PERF_EVENT_STATE_INACTIVE;
+               perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                event->oncpu = -1;
                ret = -EAGAIN;
                goto out;
        }
  
-       event->tstamp_running += tstamp - event->tstamp_stopped;
        if (!is_software_event(event))
                cpuctx->active_oncpu++;
        if (!ctx->nr_active++)
@@@ -2134,8 -2105,6 +2105,6 @@@ group_sched_in(struct perf_event *group
  {
        struct perf_event *event, *partial_group = NULL;
        struct pmu *pmu = ctx->pmu;
-       u64 now = ctx->time;
-       bool simulate = false;
  
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
@@@ -2165,27 -2134,13 +2134,13 @@@ group_error
        /*
         * Groups can be scheduled in as one unit only, so undo any
         * partial group before returning:
-        * The events up to the failed event are scheduled out normally,
-        * tstamp_stopped will be updated.
-        *
-        * The failed events and the remaining siblings need to have
-        * their timings updated as if they had gone thru event_sched_in()
-        * and event_sched_out(). This is required to get consistent timings
-        * across the group. This also takes care of the case where the group
-        * could never be scheduled by ensuring tstamp_stopped is set to mark
-        * the time the event was actually stopped, such that time delta
-        * calculation in update_event_times() is correct.
+        * The events up to the failed event are scheduled out normally.
         */
        list_for_each_entry(event, &group_event->sibling_list, group_entry) {
                if (event == partial_group)
-                       simulate = true;
+                       break;
  
-               if (simulate) {
-                       event->tstamp_running += now - event->tstamp_stopped;
-                       event->tstamp_stopped = now;
-               } else {
-                       event_sched_out(event, cpuctx, ctx);
-               }
+               event_sched_out(event, cpuctx, ctx);
        }
        event_sched_out(group_event, cpuctx, ctx);
  
@@@ -2227,46 -2182,11 +2182,11 @@@ static int group_can_go_on(struct perf_
        return can_add_hw;
  }
  
- /*
-  * Complement to update_event_times(). This computes the tstamp_* values to
-  * continue 'enabled' state from @now, and effectively discards the time
-  * between the prior tstamp_stopped and now (as we were in the OFF state, or
-  * just switched (context) time base).
-  *
-  * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
-  * cannot have been scheduled in yet. And going into INACTIVE state means
-  * '@event->tstamp_stopped = @now'.
-  *
-  * Thus given the rules of update_event_times():
-  *
-  *   total_time_enabled = tstamp_stopped - tstamp_enabled
-  *   total_time_running = tstamp_stopped - tstamp_running
-  *
-  * We can insert 'tstamp_stopped == now' and reverse them to compute new
-  * tstamp_* values.
-  */
- static void __perf_event_enable_time(struct perf_event *event, u64 now)
- {
-       WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
-       event->tstamp_stopped = now;
-       event->tstamp_enabled = now - event->total_time_enabled;
-       event->tstamp_running = now - event->total_time_running;
- }
  static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
  {
-       u64 tstamp = perf_event_time(event);
        list_add_event(event, ctx);
        perf_group_attach(event);
-       /*
-        * We can be called with event->state == STATE_OFF when we create with
-        * .disabled = 1. In that case the IOC_ENABLE will call this function.
-        */
-       if (event->state == PERF_EVENT_STATE_INACTIVE)
-               __perf_event_enable_time(event, tstamp);
  }
  
  static void ctx_sched_out(struct perf_event_context *ctx,
@@@ -2497,28 -2417,6 +2417,6 @@@ again
        raw_spin_unlock_irq(&ctx->lock);
  }
  
- /*
-  * Put a event into inactive state and update time fields.
-  * Enabling the leader of a group effectively enables all
-  * the group members that aren't explicitly disabled, so we
-  * have to update their ->tstamp_enabled also.
-  * Note: this works for group members as well as group leaders
-  * since the non-leader members' sibling_lists will be empty.
-  */
- static void __perf_event_mark_enabled(struct perf_event *event)
- {
-       struct perf_event *sub;
-       u64 tstamp = perf_event_time(event);
-       event->state = PERF_EVENT_STATE_INACTIVE;
-       __perf_event_enable_time(event, tstamp);
-       list_for_each_entry(sub, &event->sibling_list, group_entry) {
-               /* XXX should not be > INACTIVE if event isn't */
-               if (sub->state >= PERF_EVENT_STATE_INACTIVE)
-                       __perf_event_enable_time(sub, tstamp);
-       }
- }
  /*
   * Cross CPU call to enable a performance event
   */
@@@ -2537,14 -2435,12 +2435,12 @@@ static void __perf_event_enable(struct 
        if (ctx->is_active)
                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
  
-       __perf_event_mark_enabled(event);
+       perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
  
        if (!ctx->is_active)
                return;
  
        if (!event_filter_match(event)) {
-               if (is_cgroup_event(event))
-                       perf_cgroup_defer_enabled(event);
                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
                return;
        }
@@@ -2864,18 -2760,10 +2760,10 @@@ static void __perf_event_sync_stat(stru
         * we know the event must be on the current CPU, therefore we
         * don't need to use it.
         */
-       switch (event->state) {
-       case PERF_EVENT_STATE_ACTIVE:
+       if (event->state == PERF_EVENT_STATE_ACTIVE)
                event->pmu->read(event);
-               /* fall-through */
-       case PERF_EVENT_STATE_INACTIVE:
-               update_event_times(event);
-               break;
  
-       default:
-               break;
-       }
+       perf_event_update_time(event);
  
        /*
         * In order to keep per-task stats reliable we need to flip the event
@@@ -3112,10 -3000,6 +3000,6 @@@ ctx_pinned_sched_in(struct perf_event_c
                if (!event_filter_match(event))
                        continue;
  
-               /* may need to reset tstamp_enabled */
-               if (is_cgroup_event(event))
-                       perf_cgroup_mark_enabled(event, ctx);
                if (group_can_go_on(event, cpuctx, 1))
                        group_sched_in(event, cpuctx, ctx);
  
                 * If this pinned group hasn't been scheduled,
                 * put it in error state.
                 */
-               if (event->state == PERF_EVENT_STATE_INACTIVE) {
-                       update_group_times(event);
-                       event->state = PERF_EVENT_STATE_ERROR;
-               }
+               if (event->state == PERF_EVENT_STATE_INACTIVE)
+                       perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
        }
  }
  
@@@ -3148,10 -3030,6 +3030,6 @@@ ctx_flexible_sched_in(struct perf_event
                if (!event_filter_match(event))
                        continue;
  
-               /* may need to reset tstamp_enabled */
-               if (is_cgroup_event(event))
-                       perf_cgroup_mark_enabled(event, ctx);
                if (group_can_go_on(event, cpuctx, can_add_hw)) {
                        if (group_sched_in(event, cpuctx, ctx))
                                can_add_hw = 0;
@@@ -3523,7 -3401,7 +3401,7 @@@ void perf_event_task_tick(void
        struct perf_event_context *ctx, *tmp;
        int throttled;
  
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        __this_cpu_inc(perf_throttled_seq);
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
@@@ -3543,7 -3421,7 +3421,7 @@@ static int event_enable_on_exec(struct 
        if (event->state >= PERF_EVENT_STATE_INACTIVE)
                return 0;
  
-       __perf_event_mark_enabled(event);
+       perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
  
        return 1;
  }
@@@ -3637,12 -3515,15 +3515,15 @@@ static void __perf_event_read(void *inf
                return;
  
        raw_spin_lock(&ctx->lock);
-       if (ctx->is_active) {
+       if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }
  
-       update_event_times(event);
+       perf_event_update_time(event);
+       if (data->group)
+               perf_event_update_sibling_time(event);
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                goto unlock;
  
        pmu->read(event);
  
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
-               update_event_times(sub);
                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
                        /*
                         * Use sibling's PMU rather than @event's since
@@@ -3686,7 -3566,8 +3566,8 @@@ static inline u64 perf_event_count(stru
   *     will not be local and we cannot read them atomically
   *   - must not have a pmu::count method
   */
- int perf_event_read_local(struct perf_event *event, u64 *value)
+ int perf_event_read_local(struct perf_event *event, u64 *value,
+                         u64 *enabled, u64 *running)
  {
        unsigned long flags;
        int ret = 0;
                goto out;
        }
  
        /*
         * If the event is currently on this CPU, its either a per-task event,
         * or local to this CPU. Furthermore it means its ACTIVE (otherwise
                event->pmu->read(event);
  
        *value = local64_read(&event->count);
+       if (enabled || running) {
+               u64 now = event->shadow_ctx_time + perf_clock();
+               u64 __enabled, __running;
+               __perf_update_times(event, now, &__enabled, &__running);
+               if (enabled)
+                       *enabled = __enabled;
+               if (running)
+                       *running = __running;
+       }
  out:
        local_irq_restore(flags);
  
  
  static int perf_event_read(struct perf_event *event, bool group)
  {
+       enum perf_event_state state = READ_ONCE(event->state);
        int event_cpu, ret = 0;
  
        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
-       if (event->state == PERF_EVENT_STATE_ACTIVE) {
-               struct perf_read_data data = {
-                       .event = event,
-                       .group = group,
-                       .ret = 0,
-               };
+ again:
+       if (state == PERF_EVENT_STATE_ACTIVE) {
+               struct perf_read_data data;
+               /*
+                * Orders the ->state and ->oncpu loads such that if we see
+                * ACTIVE we must also see the right ->oncpu.
+                *
+                * Matches the smp_wmb() from event_sched_in().
+                */
+               smp_rmb();
  
                event_cpu = READ_ONCE(event->oncpu);
                if ((unsigned)event_cpu >= nr_cpu_ids)
                        return 0;
  
+               data = (struct perf_read_data){
+                       .event = event,
+                       .group = group,
+                       .ret = 0,
+               };
                preempt_disable();
                event_cpu = __perf_event_read_cpu(event, event_cpu);
  
                (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
                preempt_enable();
                ret = data.ret;
-       } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+       } else if (state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;
  
                raw_spin_lock_irqsave(&ctx->lock, flags);
+               state = event->state;
+               if (state != PERF_EVENT_STATE_INACTIVE) {
+                       raw_spin_unlock_irqrestore(&ctx->lock, flags);
+                       goto again;
+               }
                /*
-                * may read while context is not active
-                * (e.g., thread is blocked), in that case
-                * we cannot update context time
+                * May read while context is not active (e.g., thread is
+                * blocked), in that case we cannot update context time
                 */
-               if (ctx->is_active) {
+               if (ctx->is_active & EVENT_TIME) {
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }
+               perf_event_update_time(event);
                if (group)
-                       update_group_times(event);
-               else
-                       update_event_times(event);
+                       perf_event_update_sibling_time(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
  
@@@ -4233,7 -4143,7 +4143,7 @@@ static void perf_remove_from_owner(stru
         * indeed free this event, otherwise we need to serialize on
         * owner->perf_event_mutex.
         */
-       owner = lockless_dereference(event->owner);
+       owner = READ_ONCE(event->owner);
        if (owner) {
                /*
                 * Since delayed_put_task_struct() also drops the last
@@@ -4330,7 -4240,7 +4240,7 @@@ again
                 * Cannot change, child events are not migrated, see the
                 * comment with perf_event_ctx_lock_nested().
                 */
-               ctx = lockless_dereference(child->ctx);
+               ctx = READ_ONCE(child->ctx);
                /*
                 * Since child_mutex nests inside ctx::mutex, we must jump
                 * through hoops. We start by grabbing a reference on the ctx.
@@@ -4390,7 -4300,7 +4300,7 @@@ static int perf_release(struct inode *i
        return 0;
  }
  
- u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
  {
        struct perf_event *child;
        u64 total = 0;
  
        return total;
  }
+ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+ {
+       struct perf_event_context *ctx;
+       u64 count;
+       ctx = perf_event_ctx_lock(event);
+       count = __perf_event_read_value(event, enabled, running);
+       perf_event_ctx_unlock(event, ctx);
+       return count;
+ }
  EXPORT_SYMBOL_GPL(perf_event_read_value);
  
  static int __perf_read_group_add(struct perf_event *leader,
        if (ret)
                return ret;
  
+       raw_spin_lock_irqsave(&ctx->lock, flags);
        /*
         * Since we co-schedule groups, {enabled,running} times of siblings
         * will be identical to those of the leader, so we only publish one
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
  
-       raw_spin_lock_irqsave(&ctx->lock, flags);
        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
                values[n++] += perf_event_count(sub);
                if (read_format & PERF_FORMAT_ID)
@@@ -4520,7 -4442,7 +4442,7 @@@ static int perf_read_one(struct perf_ev
        u64 values[4];
        int n = 0;
  
-       values[n++] = perf_event_read_value(event, &enabled, &running);
+       values[n++] = __perf_event_read_value(event, &enabled, &running);
        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                values[n++] = enabled;
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
@@@ -4899,8 -4821,7 +4821,7 @@@ static void calc_timer_values(struct pe
  
        *now = perf_clock();
        ctx_time = event->shadow_ctx_time + *now;
-       *enabled = ctx_time - event->tstamp_enabled;
-       *running = ctx_time - event->tstamp_running;
+       __perf_update_times(event, ctx_time, enabled, running);
  }
  
  static void perf_event_init_userpage(struct perf_event *event)
@@@ -5304,8 -5225,8 +5225,8 @@@ static int perf_mmap(struct file *file
                if (!rb)
                        goto aux_unlock;
  
-               aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
-               aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+               aux_offset = READ_ONCE(rb->user_page->aux_offset);
+               aux_size = READ_ONCE(rb->user_page->aux_size);
  
                if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
                        goto aux_unlock;
@@@ -6756,7 -6677,6 +6677,7 @@@ static void perf_fill_ns_link_info(stru
                ns_inode = ns_path.dentry->d_inode;
                ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
                ns_link_info->ino = ns_inode->i_ino;
 +              path_put(&ns_path);
        }
  }
  
@@@ -8075,6 -7995,7 +7996,7 @@@ static void bpf_overflow_handler(struc
        struct bpf_perf_event_data_kern ctx = {
                .data = data,
                .regs = regs,
+               .event = event,
        };
        int ret = 0;
  
@@@ -9405,6 -9326,11 +9327,11 @@@ static void account_event(struct perf_e
                inc = true;
  
        if (inc) {
+               /*
+                * We need the mutex here because static_branch_enable()
+                * must complete *before* the perf_sched_count increment
+                * becomes visible.
+                */
                if (atomic_inc_not_zero(&perf_sched_count))
                        goto enabled;
  
@@@ -10530,7 -10456,7 +10457,7 @@@ perf_event_exit_event(struct perf_even
        if (parent_event)
                perf_group_detach(child_event);
        list_del_event(child_event, child_ctx);
-       child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+       perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
        raw_spin_unlock_irq(&child_ctx->lock);
  
        /*
@@@ -10768,7 -10694,7 +10695,7 @@@ inherit_event(struct perf_event *parent
              struct perf_event *group_leader,
              struct perf_event_context *child_ctx)
  {
-       enum perf_event_active_state parent_state = parent_event->state;
+       enum perf_event_state parent_state = parent_event->state;
        struct perf_event *child_event;
        unsigned long flags;
  
@@@ -11104,6 -11030,7 +11031,7 @@@ static void __perf_event_exit_context(v
        struct perf_event *event;
  
        raw_spin_lock(&ctx->lock);
+       ctx_sched_out(ctx, cpuctx, EVENT_TIME);
        list_for_each_entry(event, &ctx->event_list, event_entry)
                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
        raw_spin_unlock(&ctx->lock);