Merge branch 'linus' into perf/urgent, to pick up dependent commits
author    Ingo Molnar <mingo@kernel.org>
          Wed, 29 Nov 2017 06:11:24 +0000 (07:11 +0100)
committer Ingo Molnar <mingo@kernel.org>
          Wed, 29 Nov 2017 06:11:24 +0000 (07:11 +0100)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/events/intel/core.c
kernel/events/core.c

index f94855000d4ed02677ae5a46a3e622bf54a5f0b2,43445da30ceab12323772e81c95f0dfb3ba8cfa3..09c26a4f139c125e000675689ebc983acd8ab91a
@@@ -2958,6 -2958,10 +2958,10 @@@ static unsigned long intel_pmu_free_run
  
        if (event->attr.use_clockid)
                flags &= ~PERF_SAMPLE_TIME;
+       if (!event->attr.exclude_kernel)
+               flags &= ~PERF_SAMPLE_REGS_USER;
+       if (event->attr.sample_regs_user & ~PEBS_REGS)
+               flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR);
        return flags;
  }
  
@@@ -3730,19 -3734,6 +3734,19 @@@ EVENT_ATTR_STR(cycles-t,      cycles_t,       "eve
  EVENT_ATTR_STR(cycles-ct,     cycles_ct,      "event=0x3c,in_tx=1,in_tx_cp=1");
  
  static struct attribute *hsw_events_attrs[] = {
 +      EVENT_PTR(mem_ld_hsw),
 +      EVENT_PTR(mem_st_hsw),
 +      EVENT_PTR(td_slots_issued),
 +      EVENT_PTR(td_slots_retired),
 +      EVENT_PTR(td_fetch_bubbles),
 +      EVENT_PTR(td_total_slots),
 +      EVENT_PTR(td_total_slots_scale),
 +      EVENT_PTR(td_recovery_bubbles),
 +      EVENT_PTR(td_recovery_bubbles_scale),
 +      NULL
 +};
 +
 +static struct attribute *hsw_tsx_events_attrs[] = {
        EVENT_PTR(tx_start),
        EVENT_PTR(tx_commit),
        EVENT_PTR(tx_abort),
        EVENT_PTR(el_conflict),
        EVENT_PTR(cycles_t),
        EVENT_PTR(cycles_ct),
 -      EVENT_PTR(mem_ld_hsw),
 -      EVENT_PTR(mem_st_hsw),
 -      EVENT_PTR(td_slots_issued),
 -      EVENT_PTR(td_slots_retired),
 -      EVENT_PTR(td_fetch_bubbles),
 -      EVENT_PTR(td_total_slots),
 -      EVENT_PTR(td_total_slots_scale),
 -      EVENT_PTR(td_recovery_bubbles),
 -      EVENT_PTR(td_recovery_bubbles_scale),
        NULL
  };
  
 +static __init struct attribute **get_hsw_events_attrs(void)
 +{
 +      return boot_cpu_has(X86_FEATURE_RTM) ?
 +              merge_attr(hsw_events_attrs, hsw_tsx_events_attrs) :
 +              hsw_events_attrs;
 +}
 +
  static ssize_t freeze_on_smi_show(struct device *cdev,
                                  struct device_attribute *attr,
                                  char *buf)
@@@ -4193,7 -4186,7 +4197,7 @@@ __init int intel_pmu_init(void
  
                x86_pmu.hw_config = hsw_hw_config;
                x86_pmu.get_event_constraints = hsw_get_event_constraints;
 -              x86_pmu.cpu_events = hsw_events_attrs;
 +              x86_pmu.cpu_events = get_hsw_events_attrs();
                x86_pmu.lbr_double_abort = true;
                extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
                        hsw_format_attr : nhm_format_attr;
  
                x86_pmu.hw_config = hsw_hw_config;
                x86_pmu.get_event_constraints = hsw_get_event_constraints;
 -              x86_pmu.cpu_events = hsw_events_attrs;
 +              x86_pmu.cpu_events = get_hsw_events_attrs();
                x86_pmu.limit_period = bdw_limit_period;
                extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
                        hsw_format_attr : nhm_format_attr;
                extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
                        hsw_format_attr : nhm_format_attr;
                extra_attr = merge_attr(extra_attr, skl_format_attr);
 -              x86_pmu.cpu_events = hsw_events_attrs;
 +              x86_pmu.cpu_events = get_hsw_events_attrs();
                intel_pmu_pebs_data_source_skl(
                        boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
                pr_cont("Skylake events, ");
diff --combined kernel/events/core.c
index ab5ac84f82e20e60efdc55613680cefc4b562117,4c39c05e029a7f357cb872ffa982d6ca2381fddb..6ceb10d87462ee9521ddf11d4880f09775fa8ff0
@@@ -209,7 -209,7 +209,7 @@@ static int event_function(void *info
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        int ret = 0;
  
-       WARN_ON_ONCE(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        perf_ctx_lock(cpuctx, task_ctx);
        /*
@@@ -306,7 -306,7 +306,7 @@@ static void event_function_local(struc
        struct task_struct *task = READ_ONCE(ctx->task);
        struct perf_event_context *task_ctx = NULL;
  
-       WARN_ON_ONCE(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        if (task) {
                if (task == TASK_TOMBSTONE)
@@@ -582,6 -582,88 +582,88 @@@ static inline u64 perf_event_clock(stru
        return event->clock();
  }
  
+ /*
+  * State based event timekeeping...
+  *
+  * The basic idea is to use event->state to determine which (if any) time
+  * fields to increment with the current delta. This means we only need to
+  * update timestamps when we change state or when they are explicitly requested
+  * (read).
+  *
+  * Event groups make things a little more complicated, but not terribly so. The
+  * rules for a group are that if the group leader is OFF the entire group is
+  * OFF, irrespective of what the group member states are. This results in
+  * __perf_effective_state().
+  *
+  * A further ramification is that when a group leader flips between OFF and
+  * !OFF, we need to update all group member times.
+  *
+  *
+  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
+  * need to make sure the relevant context time is updated before we try and
+  * update our timestamps.
+  */
+ static __always_inline enum perf_event_state
+ __perf_effective_state(struct perf_event *event)
+ {
+       struct perf_event *leader = event->group_leader;
+       if (leader->state <= PERF_EVENT_STATE_OFF)
+               return leader->state;
+       return event->state;
+ }
+ static __always_inline void
+ __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
+ {
+       enum perf_event_state state = __perf_effective_state(event);
+       u64 delta = now - event->tstamp;
+       *enabled = event->total_time_enabled;
+       if (state >= PERF_EVENT_STATE_INACTIVE)
+               *enabled += delta;
+       *running = event->total_time_running;
+       if (state >= PERF_EVENT_STATE_ACTIVE)
+               *running += delta;
+ }
+ static void perf_event_update_time(struct perf_event *event)
+ {
+       u64 now = perf_event_time(event);
+       __perf_update_times(event, now, &event->total_time_enabled,
+                                       &event->total_time_running);
+       event->tstamp = now;
+ }
+ static void perf_event_update_sibling_time(struct perf_event *leader)
+ {
+       struct perf_event *sibling;
+       list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+               perf_event_update_time(sibling);
+ }
+ static void
+ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
+ {
+       if (event->state == state)
+               return;
+       perf_event_update_time(event);
+       /*
+        * If a group leader gets enabled/disabled all its siblings
+        * are affected too.
+        */
+       if ((event->state < 0) ^ (state < 0))
+               perf_event_update_sibling_time(event);
+       WRITE_ONCE(event->state, state);
+ }
  #ifdef CONFIG_CGROUP_PERF
  
  static inline bool
@@@ -841,40 -923,6 +923,6 @@@ perf_cgroup_set_shadow_time(struct perf
        event->shadow_ctx_time = now - t->timestamp;
  }
  
- static inline void
- perf_cgroup_defer_enabled(struct perf_event *event)
- {
-       /*
-        * when the current task's perf cgroup does not match
-        * the event's, we need to remember to call the
-        * perf_mark_enable() function the first time a task with
-        * a matching perf cgroup is scheduled in.
-        */
-       if (is_cgroup_event(event) && !perf_cgroup_match(event))
-               event->cgrp_defer_enabled = 1;
- }
- static inline void
- perf_cgroup_mark_enabled(struct perf_event *event,
-                        struct perf_event_context *ctx)
- {
-       struct perf_event *sub;
-       u64 tstamp = perf_event_time(event);
-       if (!event->cgrp_defer_enabled)
-               return;
-       event->cgrp_defer_enabled = 0;
-       event->tstamp_enabled = tstamp - event->total_time_enabled;
-       list_for_each_entry(sub, &event->sibling_list, group_entry) {
-               if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
-                       sub->tstamp_enabled = tstamp - sub->total_time_enabled;
-                       sub->cgrp_defer_enabled = 0;
-               }
-       }
- }
  /*
   * Update cpuctx->cgrp so that it is set when first cgroup event is added and
   * cleared when last cgroup event is removed.
@@@ -974,17 -1022,6 +1022,6 @@@ static inline u64 perf_cgroup_event_tim
        return 0;
  }
  
- static inline void
- perf_cgroup_defer_enabled(struct perf_event *event)
- {
- }
- static inline void
- perf_cgroup_mark_enabled(struct perf_event *event,
-                        struct perf_event_context *ctx)
- {
- }
  static inline void
  list_update_cgroup_event(struct perf_event *event,
                         struct perf_event_context *ctx, bool add)
@@@ -1006,7 -1043,7 +1043,7 @@@ static enum hrtimer_restart perf_mux_hr
        struct perf_cpu_context *cpuctx;
        int rotations = 0;
  
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
        rotations = perf_rotate_context(cpuctx);
@@@ -1093,7 -1130,7 +1130,7 @@@ static void perf_event_ctx_activate(str
  {
        struct list_head *head = this_cpu_ptr(&active_ctx_list);
  
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        WARN_ON(!list_empty(&ctx->active_ctx_list));
  
  
  static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
  {
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        WARN_ON(list_empty(&ctx->active_ctx_list));
  
@@@ -1202,7 -1239,7 +1239,7 @@@ perf_event_ctx_lock_nested(struct perf_
  
  again:
        rcu_read_lock();
-       ctx = ACCESS_ONCE(event->ctx);
+       ctx = READ_ONCE(event->ctx);
        if (!atomic_inc_not_zero(&ctx->refcount)) {
                rcu_read_unlock();
                goto again;
@@@ -1398,60 -1435,6 +1435,6 @@@ static u64 perf_event_time(struct perf_
        return ctx ? ctx->time : 0;
  }
  
- /*
-  * Update the total_time_enabled and total_time_running fields for a event.
-  */
- static void update_event_times(struct perf_event *event)
- {
-       struct perf_event_context *ctx = event->ctx;
-       u64 run_end;
-       lockdep_assert_held(&ctx->lock);
-       if (event->state < PERF_EVENT_STATE_INACTIVE ||
-           event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
-               return;
-       /*
-        * in cgroup mode, time_enabled represents
-        * the time the event was enabled AND active
-        * tasks were in the monitored cgroup. This is
-        * independent of the activity of the context as
-        * there may be a mix of cgroup and non-cgroup events.
-        *
-        * That is why we treat cgroup events differently
-        * here.
-        */
-       if (is_cgroup_event(event))
-               run_end = perf_cgroup_event_time(event);
-       else if (ctx->is_active)
-               run_end = ctx->time;
-       else
-               run_end = event->tstamp_stopped;
-       event->total_time_enabled = run_end - event->tstamp_enabled;
-       if (event->state == PERF_EVENT_STATE_INACTIVE)
-               run_end = event->tstamp_stopped;
-       else
-               run_end = perf_event_time(event);
-       event->total_time_running = run_end - event->tstamp_running;
- }
- /*
-  * Update total_time_enabled and total_time_running for all events in a group.
-  */
- static void update_group_times(struct perf_event *leader)
- {
-       struct perf_event *event;
-       update_event_times(leader);
-       list_for_each_entry(event, &leader->sibling_list, group_entry)
-               update_event_times(event);
- }
  static enum event_type_t get_event_type(struct perf_event *event)
  {
        struct perf_event_context *ctx = event->ctx;
@@@ -1494,6 -1477,8 +1477,8 @@@ list_add_event(struct perf_event *event
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        event->attach_state |= PERF_ATTACH_CONTEXT;
  
+       event->tstamp = perf_event_time(event);
        /*
         * If we're a stand alone event or group leader, we go to the context
         * list, group events are kept attached to the group so that
@@@ -1701,8 -1686,6 +1686,6 @@@ list_del_event(struct perf_event *event
        if (event->group_leader == event)
                list_del_init(&event->group_entry);
  
-       update_group_times(event);
        /*
         * If event was in error state, then keep it
         * that way, otherwise bogus counts will be
         * of the event
         */
        if (event->state > PERF_EVENT_STATE_OFF)
-               event->state = PERF_EVENT_STATE_OFF;
+               perf_event_set_state(event, PERF_EVENT_STATE_OFF);
  
        ctx->generation++;
  }
@@@ -1810,38 -1793,24 +1793,24 @@@ event_sched_out(struct perf_event *even
                  struct perf_cpu_context *cpuctx,
                  struct perf_event_context *ctx)
  {
-       u64 tstamp = perf_event_time(event);
-       u64 delta;
+       enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
  
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);
  
-       /*
-        * An event which could not be activated because of
-        * filter mismatch still needs to have its timings
-        * maintained, otherwise bogus information is return
-        * via read() for time_enabled, time_running:
-        */
-       if (event->state == PERF_EVENT_STATE_INACTIVE &&
-           !event_filter_match(event)) {
-               delta = tstamp - event->tstamp_stopped;
-               event->tstamp_running += delta;
-               event->tstamp_stopped = tstamp;
-       }
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;
  
        perf_pmu_disable(event->pmu);
  
-       event->tstamp_stopped = tstamp;
        event->pmu->del(event, 0);
        event->oncpu = -1;
-       event->state = PERF_EVENT_STATE_INACTIVE;
        if (event->pending_disable) {
                event->pending_disable = 0;
-               event->state = PERF_EVENT_STATE_OFF;
+               state = PERF_EVENT_STATE_OFF;
        }
+       perf_event_set_state(event, state);
  
        if (!is_software_event(event))
                cpuctx->active_oncpu--;
@@@ -1861,7 -1830,9 +1830,9 @@@ group_sched_out(struct perf_event *grou
                struct perf_event_context *ctx)
  {
        struct perf_event *event;
-       int state = group_event->state;
+       if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+               return;
  
        perf_pmu_disable(ctx->pmu);
  
  
        perf_pmu_enable(ctx->pmu);
  
-       if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
+       if (group_event->attr.exclusive)
                cpuctx->exclusive = 0;
  }
  
@@@ -1895,6 -1866,11 +1866,11 @@@ __perf_remove_from_context(struct perf_
  {
        unsigned long flags = (unsigned long)info;
  
+       if (ctx->is_active & EVENT_TIME) {
+               update_context_time(ctx);
+               update_cgrp_time_from_cpuctx(cpuctx);
+       }
        event_sched_out(event, cpuctx, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
@@@ -1957,14 -1933,17 +1933,17 @@@ static void __perf_event_disable(struc
        if (event->state < PERF_EVENT_STATE_INACTIVE)
                return;
  
-       update_context_time(ctx);
-       update_cgrp_time_from_event(event);
-       update_group_times(event);
+       if (ctx->is_active & EVENT_TIME) {
+               update_context_time(ctx);
+               update_cgrp_time_from_event(event);
+       }
        if (event == event->group_leader)
                group_sched_out(event, cpuctx, ctx);
        else
                event_sched_out(event, cpuctx, ctx);
-       event->state = PERF_EVENT_STATE_OFF;
+       perf_event_set_state(event, PERF_EVENT_STATE_OFF);
  }
  
  /*
@@@ -2021,8 -2000,7 +2000,7 @@@ void perf_event_disable_inatomic(struc
  }
  
  static void perf_set_shadow_time(struct perf_event *event,
-                                struct perf_event_context *ctx,
-                                u64 tstamp)
+                                struct perf_event_context *ctx)
  {
        /*
         * use the correct time source for the time snapshot
         * is cleaner and simpler to understand.
         */
        if (is_cgroup_event(event))
-               perf_cgroup_set_shadow_time(event, tstamp);
+               perf_cgroup_set_shadow_time(event, event->tstamp);
        else
-               event->shadow_ctx_time = tstamp - ctx->timestamp;
+               event->shadow_ctx_time = event->tstamp - ctx->timestamp;
  }
  
  #define MAX_INTERRUPTS (~0ULL)
@@@ -2065,7 -2043,6 +2043,6 @@@ event_sched_in(struct perf_event *event
                 struct perf_cpu_context *cpuctx,
                 struct perf_event_context *ctx)
  {
-       u64 tstamp = perf_event_time(event);
        int ret = 0;
  
        lockdep_assert_held(&ctx->lock);
  
        WRITE_ONCE(event->oncpu, smp_processor_id());
        /*
-        * Order event::oncpu write to happen before the ACTIVE state
-        * is visible.
+        * Order event::oncpu write to happen before the ACTIVE state is
+        * visible. This allows perf_event_{stop,read}() to observe the correct
+        * ->oncpu if it sees ACTIVE.
         */
        smp_wmb();
-       WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
+       perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
  
        /*
         * Unthrottle events, since we scheduled we might have missed several
                event->hw.interrupts = 0;
        }
  
-       /*
-        * The new state must be visible before we turn it on in the hardware:
-        */
-       smp_wmb();
        perf_pmu_disable(event->pmu);
  
-       perf_set_shadow_time(event, ctx, tstamp);
+       perf_set_shadow_time(event, ctx);
  
        perf_log_itrace_start(event);
  
        if (event->pmu->add(event, PERF_EF_START)) {
-               event->state = PERF_EVENT_STATE_INACTIVE;
+               perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                event->oncpu = -1;
                ret = -EAGAIN;
                goto out;
        }
  
-       event->tstamp_running += tstamp - event->tstamp_stopped;
        if (!is_software_event(event))
                cpuctx->active_oncpu++;
        if (!ctx->nr_active++)
@@@ -2134,8 -2105,6 +2105,6 @@@ group_sched_in(struct perf_event *group
  {
        struct perf_event *event, *partial_group = NULL;
        struct pmu *pmu = ctx->pmu;
-       u64 now = ctx->time;
-       bool simulate = false;
  
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
@@@ -2165,27 -2134,13 +2134,13 @@@ group_error
        /*
         * Groups can be scheduled in as one unit only, so undo any
         * partial group before returning:
-        * The events up to the failed event are scheduled out normally,
-        * tstamp_stopped will be updated.
-        *
-        * The failed events and the remaining siblings need to have
-        * their timings updated as if they had gone thru event_sched_in()
-        * and event_sched_out(). This is required to get consistent timings
-        * across the group. This also takes care of the case where the group
-        * could never be scheduled by ensuring tstamp_stopped is set to mark
-        * the time the event was actually stopped, such that time delta
-        * calculation in update_event_times() is correct.
+        * The events up to the failed event are scheduled out normally.
         */
        list_for_each_entry(event, &group_event->sibling_list, group_entry) {
                if (event == partial_group)
-                       simulate = true;
+                       break;
  
-               if (simulate) {
-                       event->tstamp_running += now - event->tstamp_stopped;
-                       event->tstamp_stopped = now;
-               } else {
-                       event_sched_out(event, cpuctx, ctx);
-               }
+               event_sched_out(event, cpuctx, ctx);
        }
        event_sched_out(group_event, cpuctx, ctx);
  
@@@ -2227,46 -2182,11 +2182,11 @@@ static int group_can_go_on(struct perf_
        return can_add_hw;
  }
  
- /*
-  * Complement to update_event_times(). This computes the tstamp_* values to
-  * continue 'enabled' state from @now, and effectively discards the time
-  * between the prior tstamp_stopped and now (as we were in the OFF state, or
-  * just switched (context) time base).
-  *
-  * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
-  * cannot have been scheduled in yet. And going into INACTIVE state means
-  * '@event->tstamp_stopped = @now'.
-  *
-  * Thus given the rules of update_event_times():
-  *
-  *   total_time_enabled = tstamp_stopped - tstamp_enabled
-  *   total_time_running = tstamp_stopped - tstamp_running
-  *
-  * We can insert 'tstamp_stopped == now' and reverse them to compute new
-  * tstamp_* values.
-  */
- static void __perf_event_enable_time(struct perf_event *event, u64 now)
- {
-       WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
-       event->tstamp_stopped = now;
-       event->tstamp_enabled = now - event->total_time_enabled;
-       event->tstamp_running = now - event->total_time_running;
- }
  static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
  {
-       u64 tstamp = perf_event_time(event);
        list_add_event(event, ctx);
        perf_group_attach(event);
-       /*
-        * We can be called with event->state == STATE_OFF when we create with
-        * .disabled = 1. In that case the IOC_ENABLE will call this function.
-        */
-       if (event->state == PERF_EVENT_STATE_INACTIVE)
-               __perf_event_enable_time(event, tstamp);
  }
  
  static void ctx_sched_out(struct perf_event_context *ctx,
@@@ -2497,28 -2417,6 +2417,6 @@@ again
        raw_spin_unlock_irq(&ctx->lock);
  }
  
- /*
-  * Put a event into inactive state and update time fields.
-  * Enabling the leader of a group effectively enables all
-  * the group members that aren't explicitly disabled, so we
-  * have to update their ->tstamp_enabled also.
-  * Note: this works for group members as well as group leaders
-  * since the non-leader members' sibling_lists will be empty.
-  */
- static void __perf_event_mark_enabled(struct perf_event *event)
- {
-       struct perf_event *sub;
-       u64 tstamp = perf_event_time(event);
-       event->state = PERF_EVENT_STATE_INACTIVE;
-       __perf_event_enable_time(event, tstamp);
-       list_for_each_entry(sub, &event->sibling_list, group_entry) {
-               /* XXX should not be > INACTIVE if event isn't */
-               if (sub->state >= PERF_EVENT_STATE_INACTIVE)
-                       __perf_event_enable_time(sub, tstamp);
-       }
- }
  /*
   * Cross CPU call to enable a performance event
   */
@@@ -2537,14 -2435,12 +2435,12 @@@ static void __perf_event_enable(struct 
        if (ctx->is_active)
                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
  
-       __perf_event_mark_enabled(event);
+       perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
  
        if (!ctx->is_active)
                return;
  
        if (!event_filter_match(event)) {
-               if (is_cgroup_event(event))
-                       perf_cgroup_defer_enabled(event);
                ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
                return;
        }
@@@ -2864,18 -2760,10 +2760,10 @@@ static void __perf_event_sync_stat(stru
         * we know the event must be on the current CPU, therefore we
         * don't need to use it.
         */
-       switch (event->state) {
-       case PERF_EVENT_STATE_ACTIVE:
+       if (event->state == PERF_EVENT_STATE_ACTIVE)
                event->pmu->read(event);
-               /* fall-through */
-       case PERF_EVENT_STATE_INACTIVE:
-               update_event_times(event);
-               break;
  
-       default:
-               break;
-       }
+       perf_event_update_time(event);
  
        /*
         * In order to keep per-task stats reliable we need to flip the event
@@@ -3112,10 -3000,6 +3000,6 @@@ ctx_pinned_sched_in(struct perf_event_c
                if (!event_filter_match(event))
                        continue;
  
-               /* may need to reset tstamp_enabled */
-               if (is_cgroup_event(event))
-                       perf_cgroup_mark_enabled(event, ctx);
                if (group_can_go_on(event, cpuctx, 1))
                        group_sched_in(event, cpuctx, ctx);
  
                 * If this pinned group hasn't been scheduled,
                 * put it in error state.
                 */
-               if (event->state == PERF_EVENT_STATE_INACTIVE) {
-                       update_group_times(event);
-                       event->state = PERF_EVENT_STATE_ERROR;
-               }
+               if (event->state == PERF_EVENT_STATE_INACTIVE)
+                       perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
        }
  }
  
@@@ -3148,10 -3030,6 +3030,6 @@@ ctx_flexible_sched_in(struct perf_event
                if (!event_filter_match(event))
                        continue;
  
-               /* may need to reset tstamp_enabled */
-               if (is_cgroup_event(event))
-                       perf_cgroup_mark_enabled(event, ctx);
                if (group_can_go_on(event, cpuctx, can_add_hw)) {
                        if (group_sched_in(event, cpuctx, ctx))
                                can_add_hw = 0;
@@@ -3523,7 -3401,7 +3401,7 @@@ void perf_event_task_tick(void
        struct perf_event_context *ctx, *tmp;
        int throttled;
  
-       WARN_ON(!irqs_disabled());
+       lockdep_assert_irqs_disabled();
  
        __this_cpu_inc(perf_throttled_seq);
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
@@@ -3543,7 -3421,7 +3421,7 @@@ static int event_enable_on_exec(struct 
        if (event->state >= PERF_EVENT_STATE_INACTIVE)
                return 0;
  
-       __perf_event_mark_enabled(event);
+       perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
  
        return 1;
  }
@@@ -3637,12 -3515,15 +3515,15 @@@ static void __perf_event_read(void *inf
                return;
  
        raw_spin_lock(&ctx->lock);
-       if (ctx->is_active) {
+       if (ctx->is_active & EVENT_TIME) {
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }
  
-       update_event_times(event);
+       perf_event_update_time(event);
+       if (data->group)
+               perf_event_update_sibling_time(event);
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                goto unlock;
  
        pmu->read(event);
  
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
-               update_event_times(sub);
                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
                        /*
                         * Use sibling's PMU rather than @event's since
@@@ -3686,7 -3566,8 +3566,8 @@@ static inline u64 perf_event_count(stru
   *     will not be local and we cannot read them atomically
   *   - must not have a pmu::count method
   */
- int perf_event_read_local(struct perf_event *event, u64 *value)
+ int perf_event_read_local(struct perf_event *event, u64 *value,
+                         u64 *enabled, u64 *running)
  {
        unsigned long flags;
        int ret = 0;
                goto out;
        }
  
        /*
         * If the event is currently on this CPU, its either a per-task event,
         * or local to this CPU. Furthermore it means its ACTIVE (otherwise
                event->pmu->read(event);
  
        *value = local64_read(&event->count);
+       if (enabled || running) {
+               u64 now = event->shadow_ctx_time + perf_clock();
+               u64 __enabled, __running;
+               __perf_update_times(event, now, &__enabled, &__running);
+               if (enabled)
+                       *enabled = __enabled;
+               if (running)
+                       *running = __running;
+       }
  out:
        local_irq_restore(flags);
  
  
  static int perf_event_read(struct perf_event *event, bool group)
  {
+       enum perf_event_state state = READ_ONCE(event->state);
        int event_cpu, ret = 0;
  
        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
-       if (event->state == PERF_EVENT_STATE_ACTIVE) {
-               struct perf_read_data data = {
-                       .event = event,
-                       .group = group,
-                       .ret = 0,
-               };
+ again:
+       if (state == PERF_EVENT_STATE_ACTIVE) {
+               struct perf_read_data data;
+               /*
+                * Orders the ->state and ->oncpu loads such that if we see
+                * ACTIVE we must also see the right ->oncpu.
+                *
+                * Matches the smp_wmb() from event_sched_in().
+                */
+               smp_rmb();
  
                event_cpu = READ_ONCE(event->oncpu);
                if ((unsigned)event_cpu >= nr_cpu_ids)
                        return 0;
  
+               data = (struct perf_read_data){
+                       .event = event,
+                       .group = group,
+                       .ret = 0,
+               };
                preempt_disable();
                event_cpu = __perf_event_read_cpu(event, event_cpu);
  
                (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
                preempt_enable();
                ret = data.ret;
-       } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+       } else if (state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;
  
                raw_spin_lock_irqsave(&ctx->lock, flags);
+               state = event->state;
+               if (state != PERF_EVENT_STATE_INACTIVE) {
+                       raw_spin_unlock_irqrestore(&ctx->lock, flags);
+                       goto again;
+               }
                /*
-                * may read while context is not active
-                * (e.g., thread is blocked), in that case
-                * we cannot update context time
+                * May read while context is not active (e.g., thread is
+                * blocked), in that case we cannot update context time
                 */
-               if (ctx->is_active) {
+               if (ctx->is_active & EVENT_TIME) {
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }
+               perf_event_update_time(event);
                if (group)
-                       update_group_times(event);
-               else
-                       update_event_times(event);
+                       perf_event_update_sibling_time(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
  
@@@ -4233,7 -4143,7 +4143,7 @@@ static void perf_remove_from_owner(stru
         * indeed free this event, otherwise we need to serialize on
         * owner->perf_event_mutex.
         */
-       owner = lockless_dereference(event->owner);
+       owner = READ_ONCE(event->owner);
        if (owner) {
                /*
                 * Since delayed_put_task_struct() also drops the last
@@@ -4330,7 -4240,7 +4240,7 @@@ again
                 * Cannot change, child events are not migrated, see the
                 * comment with perf_event_ctx_lock_nested().
                 */
-               ctx = lockless_dereference(child->ctx);
+               ctx = READ_ONCE(child->ctx);
                /*
                 * Since child_mutex nests inside ctx::mutex, we must jump
                 * through hoops. We start by grabbing a reference on the ctx.
@@@ -4390,7 -4300,7 +4300,7 @@@ static int perf_release(struct inode *i
        return 0;
  }
  
- u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
  {
        struct perf_event *child;
        u64 total = 0;
  
        return total;
  }
+ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+ {
+       struct perf_event_context *ctx;
+       u64 count;
+       ctx = perf_event_ctx_lock(event);
+       count = __perf_event_read_value(event, enabled, running);
+       perf_event_ctx_unlock(event, ctx);
+       return count;
+ }
  EXPORT_SYMBOL_GPL(perf_event_read_value);
  
  static int __perf_read_group_add(struct perf_event *leader,
        if (ret)
                return ret;
  
+       raw_spin_lock_irqsave(&ctx->lock, flags);
        /*
         * Since we co-schedule groups, {enabled,running} times of siblings
         * will be identical to those of the leader, so we only publish one
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
  
-       raw_spin_lock_irqsave(&ctx->lock, flags);
        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
                values[n++] += perf_event_count(sub);
                if (read_format & PERF_FORMAT_ID)
@@@ -4520,7 -4442,7 +4442,7 @@@ static int perf_read_one(struct perf_ev
        u64 values[4];
        int n = 0;
  
-       values[n++] = perf_event_read_value(event, &enabled, &running);
+       values[n++] = __perf_event_read_value(event, &enabled, &running);
        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                values[n++] = enabled;
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
@@@ -4899,8 -4821,7 +4821,7 @@@ static void calc_timer_values(struct pe
  
        *now = perf_clock();
        ctx_time = event->shadow_ctx_time + *now;
-       *enabled = ctx_time - event->tstamp_enabled;
-       *running = ctx_time - event->tstamp_running;
+       __perf_update_times(event, ctx_time, enabled, running);
  }
  
  static void perf_event_init_userpage(struct perf_event *event)
@@@ -5304,8 -5225,8 +5225,8 @@@ static int perf_mmap(struct file *file
                if (!rb)
                        goto aux_unlock;
  
-               aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
-               aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+               aux_offset = READ_ONCE(rb->user_page->aux_offset);
+               aux_size = READ_ONCE(rb->user_page->aux_size);
  
                if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
                        goto aux_unlock;
@@@ -6756,7 -6677,6 +6677,7 @@@ static void perf_fill_ns_link_info(stru
                ns_inode = ns_path.dentry->d_inode;
                ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
                ns_link_info->ino = ns_inode->i_ino;
 +              path_put(&ns_path);
        }
  }
  
@@@ -8075,6 -7995,7 +7996,7 @@@ static void bpf_overflow_handler(struc
        struct bpf_perf_event_data_kern ctx = {
                .data = data,
                .regs = regs,
+               .event = event,
        };
        int ret = 0;
  
@@@ -9405,6 -9326,11 +9327,11 @@@ static void account_event(struct perf_e
                inc = true;
  
        if (inc) {
+               /*
+                * We need the mutex here because static_branch_enable()
+                * must complete *before* the perf_sched_count increment
+                * becomes visible.
+                */
                if (atomic_inc_not_zero(&perf_sched_count))
                        goto enabled;
  
@@@ -10530,7 -10456,7 +10457,7 @@@ perf_event_exit_event(struct perf_even
        if (parent_event)
                perf_group_detach(child_event);
        list_del_event(child_event, child_ctx);
-       child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+       perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
        raw_spin_unlock_irq(&child_ctx->lock);
  
        /*
@@@ -10768,7 -10694,7 +10695,7 @@@ inherit_event(struct perf_event *parent
              struct perf_event *group_leader,
              struct perf_event_context *child_ctx)
  {
-       enum perf_event_active_state parent_state = parent_event->state;
+       enum perf_event_state parent_state = parent_event->state;
        struct perf_event *child_event;
        unsigned long flags;
  
@@@ -11104,6 -11030,7 +11031,7 @@@ static void __perf_event_exit_context(v
        struct perf_event *event;
  
        raw_spin_lock(&ctx->lock);
+       ctx_sched_out(ctx, cpuctx, EVENT_TIME);
        list_for_each_entry(event, &ctx->event_list, event_entry)
                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
        raw_spin_unlock(&ctx->lock);