Merge branch 'for-3.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <torvalds@linux-foundation.org>

Wed, 21 Mar 2012 01:11:21 +0000 (18:11 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 21 Mar 2012 01:11:21 +0000 (18:11 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 21 Mar 2012 01:11:21 +0000 (18:11 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 21 Mar 2012 01:11:21 +0000 (18:11 -0700)
diff --combined block/blk-cgroup.c

index 75642a352a8f40595069f0833a8e7cef131eca02,1359d637831f51b3344557c0652fc7c9687d36c4..ea84a23d5e680d8acc8dfc67948e5872f0142ba7
--- 1/block/blk-cgroup.c
--- 2/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@@ -28,13 -28,10 +28,10 @@@ static LIST_HEAD(blkio_list)
   struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
   EXPORT_SYMBOL_GPL(blkio_root_cgroup);
   
- static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
-                                                 struct cgroup *);
- static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
-                             struct cgroup_taskset *);
- static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
-                          struct cgroup_taskset *);
- static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
+ static struct cgroup_subsys_state *blkiocg_create(struct cgroup *);
+ static int blkiocg_can_attach(struct cgroup *, struct cgroup_taskset *);
+ static void blkiocg_attach(struct cgroup *, struct cgroup_taskset *);
+ static void blkiocg_destroy(struct cgroup *);
   static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
   
   /* for encoding cft->private value on file */
@@@ -1548,7 -1545,7 +1545,7 @@@ static int blkiocg_populate(struct cgro
                                 ARRAY_SIZE(blkio_files));
   }
   
- static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+ static void blkiocg_destroy(struct cgroup *cgroup)
   {
         struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
         unsigned long flags;
@@@ -1598,8 -1595,7 +1595,7 @@@
                 kfree(blkcg);
   }
   
- static struct cgroup_subsys_state *
- blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+ static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup)
   {
         struct blkio_cgroup *blkcg;
         struct cgroup *parent = cgroup->parent;
@@@ -1628,8 -1624,7 +1624,7 @@@ done
    * of the main cic data structures.  For now we allow a task to change
    * its cgroup only if it's the only owner of its ioc.
    */
- static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                             struct cgroup_taskset *tset)
+ static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
   {
         struct task_struct *task;
         struct io_context *ioc;
@@@ -1648,8 -1643,7 +1643,7 @@@
         return ret;
   }
   
- static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                          struct cgroup_taskset *tset)
+ static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
   {
         struct task_struct *task;
         struct io_context *ioc;
@@@ -1659,7 -1653,7 +1653,7 @@@
                 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
                 if (ioc) {
                         ioc_cgroup_changed(ioc);
- -                      put_io_context(ioc, NULL);
+ +                      put_io_context(ioc);
                 }
         }
   }
diff --combined include/net/sock.h

index dcde2d9268cd109319f4631eba8b5b6c1af1d92e,705d1add19a13a6bfd666a13a10200aacc84e3b9..7ef5c58f3f49ab4ea0477d3b9fc0509b9f283eee
--- 1/include/net/sock.h
--- 2/include/net/sock.h
+++ b/include/net/sock.h
@@@ -55,7 -55,6 +55,7 @@@
   #include <linux/uaccess.h>
   #include <linux/memcontrol.h>
   #include <linux/res_counter.h>
+ +#include <linux/static_key.h>
   
   #include <linux/filter.h>
   #include <linux/rculist_nulls.h>
@@@ -69,7 -68,7 +69,7 @@@ struct cgroup
   struct cgroup_subsys;
   #ifdef CONFIG_NET
   int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss);
- void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss);
+ void mem_cgroup_sockets_destroy(struct cgroup *cgrp);
   #else
   static inline
   int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
@@@ -77,7 -76,7 +77,7 @@@
         return 0;
   }
   static inline
- void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
+ void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
   {
   }
   #endif
@@@ -227,7 -226,6 +227,7 @@@ struct cg_proto
     *   @sk_ack_backlog: current listen backlog
     *   @sk_max_ack_backlog: listen backlog set in listen()
     *   @sk_priority: %SO_PRIORITY setting
+ +  *   @sk_cgrp_prioidx: socket group's priority map index
     *   @sk_type: socket type (%SOCK_STREAM, etc)
     *   @sk_protocol: which protocol this socket belongs in this network family
     *   @sk_peer_pid: &struct pid for this socket's peer
@@@ -871,8 -869,7 +871,7 @@@ struct proto 
          */
         int                     (*init_cgroup)(struct cgroup *cgrp,
                                                struct cgroup_subsys *ss);
-       void                    (*destroy_cgroup)(struct cgroup *cgrp,
-                                                 struct cgroup_subsys *ss);
+       void                    (*destroy_cgroup)(struct cgroup *cgrp);
         struct cg_proto         *(*proto_cgroup)(struct mem_cgroup *memcg);
   #endif
   };
@@@ -923,14 -920,14 +922,14 @@@ inline void sk_refcnt_debug_release(con
   #define sk_refcnt_debug_release(sk) do { } while (0)
   #endif /* SOCK_REFCNT_DEBUG */
   
- -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
- -extern struct jump_label_key memcg_socket_limit_enabled;
+ +#if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET)
+ +extern struct static_key memcg_socket_limit_enabled;
   static inline struct cg_proto *parent_cg_proto(struct proto *proto,
                                                struct cg_proto *cg_proto)
   {
         return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg));
   }
- -#define mem_cgroup_sockets_enabled static_branch(&memcg_socket_limit_enabled)
+ +#define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled)
   #else
   #define mem_cgroup_sockets_enabled 0
   static inline struct cg_proto *parent_cg_proto(struct proto *proto,
@@@ -1009,8 -1006,9 +1008,8 @@@ static inline void memcg_memory_allocat
         struct res_counter *fail;
         int ret;
   
- -      ret = res_counter_charge(prot->memory_allocated,
- -                               amt << PAGE_SHIFT, &fail);
- -
+ +      ret = res_counter_charge_nofail(prot->memory_allocated,
+ +                                      amt << PAGE_SHIFT, &fail);
         if (ret < 0)
                 *parent_status = OVER_LIMIT;
   }
@@@ -1054,11 -1052,12 +1053,11 @@@ sk_memory_allocated_add(struct sock *sk
   }
   
   static inline void
- -sk_memory_allocated_sub(struct sock *sk, int amt, int parent_status)
+ +sk_memory_allocated_sub(struct sock *sk, int amt)
   {
         struct proto *prot = sk->sk_prot;
   
- -      if (mem_cgroup_sockets_enabled && sk->sk_cgrp &&
- -          parent_status != OVER_LIMIT) /* Otherwise was uncharged already */
+ +      if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
                 memcg_memory_allocated_sub(sk->sk_cgrp, amt);
   
         atomic_long_sub(amt, prot->memory_allocated);
diff --combined kernel/events/core.c

index c61234b1a988da657babcecda2f6a070bd9d522e,a5d1ee92b0d9d7ca91c1d655004f451a00adff3a..4b50357914fb437a30cd146e1bd33e1f2b43c449
--- 1/kernel/events/core.c
--- 2/kernel/events/core.c
+++ b/kernel/events/core.c
@@@ -118,13 -118,6 +118,13 @@@ static int cpu_function_call(int cpu, i
                        PERF_FLAG_FD_OUTPUT  |\
                        PERF_FLAG_PID_CGROUP)
   
+ +/*
+ + * branch priv levels that need permission checks
+ + */
+ +#define PERF_SAMPLE_BRANCH_PERM_PLM \
+ +      (PERF_SAMPLE_BRANCH_KERNEL |\
+ +       PERF_SAMPLE_BRANCH_HV)
+ +
   enum event_type_t {
         EVENT_FLEXIBLE = 0x1,
         EVENT_PINNED = 0x2,
@@@ -135,9 -128,8 +135,9 @@@
    * perf_sched_events : >0 events exist
    * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
    */
- -struct jump_label_key_deferred perf_sched_events __read_mostly;
+ +struct static_key_deferred perf_sched_events __read_mostly;
   static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+ +static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
   
   static atomic_t nr_mmap_events __read_mostly;
   static atomic_t nr_comm_events __read_mostly;
@@@ -823,7 -815,7 +823,7 @@@ static void update_event_times(struct p
          * here.
          */
         if (is_cgroup_event(event))
- -              run_end = perf_event_time(event);
+ +              run_end = perf_cgroup_event_time(event);
         else if (ctx->is_active)
                 run_end = ctx->time;
         else
@@@ -889,9 -881,6 +889,9 @@@ list_add_event(struct perf_event *event
         if (is_cgroup_event(event))
                 ctx->nr_cgroups++;
   
+ +      if (has_branch_stack(event))
+ +              ctx->nr_branch_stack++;
+ +
         list_add_rcu(&event->event_entry, &ctx->event_list);
         if (!ctx->nr_events)
                 perf_pmu_rotate_start(ctx->pmu);
@@@ -1031,9 -1020,6 +1031,9 @@@ list_del_event(struct perf_event *event
                         cpuctx->cgrp = NULL;
         }
   
+ +      if (has_branch_stack(event))
+ +              ctx->nr_branch_stack--;
+ +
         ctx->nr_events--;
         if (event->attr.inherit_stat)
                 ctx->nr_stat--;
@@@ -2208,66 -2194,6 +2208,66 @@@ static void perf_event_context_sched_in
         perf_pmu_rotate_start(ctx->pmu);
   }
   
+ +/*
+ + * When sampling the branch stack in system-wide mode, it may be necessary
+ + * to flush the stack on context switch. This happens when the branch
+ + * stack does not tag its entries with the pid of the current task.
+ + * Otherwise it becomes impossible to associate a branch entry with a
+ + * task. This ambiguity is more likely to appear when the branch stack
+ + * supports priv level filtering and the user sets it to monitor only
+ + * at the user level (which could be a useful measurement in system-wide
+ + * mode). In that case, the risk is high of having a branch stack with
+ + * branches from multiple tasks. Flushing may mean dropping the existing
+ + * entries or stashing them somewhere in the PMU specific code layer.
+ + *
+ + * This function provides the context switch callback to the lower code
+ + * layer. It is invoked ONLY when there is at least one system-wide context
+ + * with at least one active event using taken branch sampling.
+ + */
+ +static void perf_branch_stack_sched_in(struct task_struct *prev,
+ +                                     struct task_struct *task)
+ +{
+ +      struct perf_cpu_context *cpuctx;
+ +      struct pmu *pmu;
+ +      unsigned long flags;
+ +
+ +      /* no need to flush branch stack if not changing task */
+ +      if (prev == task)
+ +              return;
+ +
+ +      local_irq_save(flags);
+ +
+ +      rcu_read_lock();
+ +
+ +      list_for_each_entry_rcu(pmu, &pmus, entry) {
+ +              cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ +
+ +              /*
+ +               * check if the context has at least one
+ +               * event using PERF_SAMPLE_BRANCH_STACK
+ +               */
+ +              if (cpuctx->ctx.nr_branch_stack > 0
+ +                  && pmu->flush_branch_stack) {
+ +
+ +                      pmu = cpuctx->ctx.pmu;
+ +
+ +                      perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ +
+ +                      perf_pmu_disable(pmu);
+ +
+ +                      pmu->flush_branch_stack();
+ +
+ +                      perf_pmu_enable(pmu);
+ +
+ +                      perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+ +              }
+ +      }
+ +
+ +      rcu_read_unlock();
+ +
+ +      local_irq_restore(flags);
+ +}
+ +
   /*
    * Called from scheduler to add the events of the current task
    * with interrupts disabled.
@@@ -2299,10 -2225,6 +2299,10 @@@ void __perf_event_task_sched_in(struct 
          */
         if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
                 perf_cgroup_sched_in(prev, task);
+ +
+ +      /* check for system-wide branch_stack events */
+ +      if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+ +              perf_branch_stack_sched_in(prev, task);
   }
   
   static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@@ -2378,10 -2300,7 +2378,10 @@@ do {                                  
         return div64_u64(dividend, divisor);
   }
   
- -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
+ +static DEFINE_PER_CPU(int, perf_throttled_count);
+ +static DEFINE_PER_CPU(u64, perf_throttled_seq);
+ +
+ +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
   {
         struct hw_perf_event *hwc = &event->hw;
         s64 period, sample_period;
@@@ -2400,40 -2319,22 +2400,40 @@@
         hwc->sample_period = sample_period;
   
         if (local64_read(&hwc->period_left) > 8*sample_period) {
- -              event->pmu->stop(event, PERF_EF_UPDATE);
+ +              if (disable)
+ +                      event->pmu->stop(event, PERF_EF_UPDATE);
+ +
                 local64_set(&hwc->period_left, 0);
- -              event->pmu->start(event, PERF_EF_RELOAD);
+ +
+ +              if (disable)
+ +                      event->pmu->start(event, PERF_EF_RELOAD);
         }
   }
   
- -static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
+ +/*
+ + * combine freq adjustment with unthrottling to avoid two passes over the
+ + * events. At the same time, make sure that having freq events does not change
+ + * the rate of unthrottling as that would introduce bias.
+ + */
+ +static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
+ +                                         int needs_unthr)
   {
         struct perf_event *event;
         struct hw_perf_event *hwc;
- -      u64 interrupts, now;
+ +      u64 now, period = TICK_NSEC;
         s64 delta;
   
- -      if (!ctx->nr_freq)
+ +      /*
+ +       * only need to iterate over all events iff:
+ +       * - context has events in frequency mode (needs freq adjust)
+ +       * - there are events to unthrottle on this cpu
+ +       */
+ +      if (!(ctx->nr_freq || needs_unthr))
                 return;
   
+ +      raw_spin_lock(&ctx->lock);
+ +      perf_pmu_disable(ctx->pmu);
+ +
         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                 if (event->state != PERF_EVENT_STATE_ACTIVE)
                         continue;
@@@ -2443,8 -2344,13 +2443,8 @@@
   
                 hwc = &event->hw;
   
- -              interrupts = hwc->interrupts;
- -              hwc->interrupts = 0;
- -
- -              /*
- -               * unthrottle events on the tick
- -               */
- -              if (interrupts == MAX_INTERRUPTS) {
+ +              if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
+ +                      hwc->interrupts = 0;
                         perf_log_throttle(event, 1);
                         event->pmu->start(event, 0);
                 }
@@@ -2452,30 -2358,14 +2452,30 @@@
                 if (!event->attr.freq || !event->attr.sample_freq)
                         continue;
   
- -              event->pmu->read(event);
+ +              /*
+ +               * stop the event and update event->count
+ +               */
+ +              event->pmu->stop(event, PERF_EF_UPDATE);
+ +
                 now = local64_read(&event->count);
                 delta = now - hwc->freq_count_stamp;
                 hwc->freq_count_stamp = now;
   
+ +              /*
+ +               * restart the event
+ +               * reload only if value has changed
+ +               * we have stopped the event so tell that
+ +               * to perf_adjust_period() to avoid stopping it
+ +               * twice.
+ +               */
                 if (delta > 0)
- -                      perf_adjust_period(event, period, delta);
+ +                      perf_adjust_period(event, period, delta, false);
+ +
+ +              event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
         }
+ +
+ +      perf_pmu_enable(ctx->pmu);
+ +      raw_spin_unlock(&ctx->lock);
   }
   
   /*
@@@ -2498,13 -2388,16 +2498,13 @@@ static void rotate_ctx(struct perf_even
    */
   static void perf_rotate_context(struct perf_cpu_context *cpuctx)
   {
- -      u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
         struct perf_event_context *ctx = NULL;
- -      int rotate = 0, remove = 1, freq = 0;
+ +      int rotate = 0, remove = 1;
   
         if (cpuctx->ctx.nr_events) {
                 remove = 0;
                 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
                         rotate = 1;
- -              if (cpuctx->ctx.nr_freq)
- -                      freq = 1;
         }
   
         ctx = cpuctx->task_ctx;
@@@ -2512,26 -2405,37 +2512,26 @@@
                 remove = 0;
                 if (ctx->nr_events != ctx->nr_active)
                         rotate = 1;
- -              if (ctx->nr_freq)
- -                      freq = 1;
         }
   
- -      if (!rotate && !freq)
+ +      if (!rotate)
                 goto done;
   
         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
         perf_pmu_disable(cpuctx->ctx.pmu);
   
- -      if (freq) {
- -              perf_ctx_adjust_freq(&cpuctx->ctx, interval);
- -              if (ctx)
- -                      perf_ctx_adjust_freq(ctx, interval);
- -      }
- -
- -      if (rotate) {
- -              cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- -              if (ctx)
- -                      ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ +      cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ +      if (ctx)
+ +              ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
   
- -              rotate_ctx(&cpuctx->ctx);
- -              if (ctx)
- -                      rotate_ctx(ctx);
+ +      rotate_ctx(&cpuctx->ctx);
+ +      if (ctx)
+ +              rotate_ctx(ctx);
   
- -              perf_event_sched_in(cpuctx, ctx, current);
- -      }
+ +      perf_event_sched_in(cpuctx, ctx, current);
   
         perf_pmu_enable(cpuctx->ctx.pmu);
         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- -
   done:
         if (remove)
                 list_del_init(&cpuctx->rotation_list);
@@@ -2541,22 -2445,10 +2541,22 @@@ void perf_event_task_tick(void
   {
         struct list_head *head = &__get_cpu_var(rotation_list);
         struct perf_cpu_context *cpuctx, *tmp;
+ +      struct perf_event_context *ctx;
+ +      int throttled;
   
         WARN_ON(!irqs_disabled());
   
+ +      __this_cpu_inc(perf_throttled_seq);
+ +      throttled = __this_cpu_xchg(perf_throttled_count, 0);
+ +
         list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
+ +              ctx = &cpuctx->ctx;
+ +              perf_adjust_freq_unthr_context(ctx, throttled);
+ +
+ +              ctx = cpuctx->task_ctx;
+ +              if (ctx)
+ +                      perf_adjust_freq_unthr_context(ctx, throttled);
+ +
                 if (cpuctx->jiffies_interval == 1 ||
                                 !(jiffies % cpuctx->jiffies_interval))
                         perf_rotate_context(cpuctx);
@@@ -2856,7 -2748,7 +2856,7 @@@ static void free_event(struct perf_even
   
         if (!event->parent) {
                 if (event->attach_state & PERF_ATTACH_TASK)
- -                      jump_label_dec_deferred(&perf_sched_events);
+ +                      static_key_slow_dec_deferred(&perf_sched_events);
                 if (event->attr.mmap || event->attr.mmap_data)
                         atomic_dec(&nr_mmap_events);
                 if (event->attr.comm)
@@@ -2867,15 -2759,7 +2867,15 @@@
                         put_callchain_buffers();
                 if (is_cgroup_event(event)) {
                         atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
- -                      jump_label_dec_deferred(&perf_sched_events);
+ +                      static_key_slow_dec_deferred(&perf_sched_events);
+ +              }
+ +
+ +              if (has_branch_stack(event)) {
+ +                      static_key_slow_dec_deferred(&perf_sched_events);
+ +                      /* is system-wide event */
+ +                      if (!(event->attach_state & PERF_ATTACH_TASK))
+ +                              atomic_dec(&per_cpu(perf_branch_stack_events,
+ +                                                  event->cpu));
                 }
         }
   
@@@ -3324,6 -3208,10 +3324,6 @@@ int perf_event_task_disable(void
         return 0;
   }
   
- -#ifndef PERF_EVENT_INDEX_OFFSET
- -# define PERF_EVENT_INDEX_OFFSET 0
- -#endif
- -
   static int perf_event_index(struct perf_event *event)
   {
         if (event->hw.state & PERF_HES_STOPPED)
@@@ -3332,26 -3220,21 +3332,26 @@@
         if (event->state != PERF_EVENT_STATE_ACTIVE)
                 return 0;
   
- -      return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
+ +      return event->pmu->event_idx(event);
   }
   
   static void calc_timer_values(struct perf_event *event,
+ +                              u64 *now,
                                 u64 *enabled,
                                 u64 *running)
   {
- -      u64 now, ctx_time;
+ +      u64 ctx_time;
   
- -      now = perf_clock();
- -      ctx_time = event->shadow_ctx_time + now;
+ +      *now = perf_clock();
+ +      ctx_time = event->shadow_ctx_time + *now;
         *enabled = ctx_time - event->tstamp_enabled;
         *running = ctx_time - event->tstamp_running;
   }
   
+ +void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+ +{
+ +}
+ +
   /*
    * Callers need to ensure there can be no nesting of this function, otherwise
    * the seqlock logic goes bad. We can not serialize this because the arch
@@@ -3361,7 -3244,7 +3361,7 @@@ void perf_event_update_userpage(struct 
   {
         struct perf_event_mmap_page *userpg;
         struct ring_buffer *rb;
- -      u64 enabled, running;
+ +      u64 enabled, running, now;
   
         rcu_read_lock();
         /*
@@@ -3373,7 -3256,7 +3373,7 @@@
          * because of locking issue as we can be called in
          * NMI context
          */
- -      calc_timer_values(event, &enabled, &running);
+ +      calc_timer_values(event, &now, &enabled, &running);
         rb = rcu_dereference(event->rb);
         if (!rb)
                 goto unlock;
@@@ -3389,7 -3272,7 +3389,7 @@@
         barrier();
         userpg->index = perf_event_index(event);
         userpg->offset = perf_event_count(event);
- -      if (event->state == PERF_EVENT_STATE_ACTIVE)
+ +      if (userpg->index)
                 userpg->offset -= local64_read(&event->hw.prev_count);
   
         userpg->time_enabled = enabled +
@@@ -3398,8 -3281,6 +3398,8 @@@
         userpg->time_running = running +
                         atomic64_read(&event->child_total_time_running);
   
+ +      perf_update_user_clock(userpg, now);
+ +
         barrier();
         ++userpg->lock;
         preempt_enable();
@@@ -3657,8 -3538,6 +3657,8 @@@ static int perf_mmap(struct file *file
         event->mmap_user = get_current_user();
         vma->vm_mm->pinned_vm += event->mmap_locked;
   
+ +      perf_event_update_userpage(event);
+ +
   unlock:
         if (!ret)
                 atomic_inc(&event->mmap_count);
@@@ -3890,7 -3769,7 +3890,7 @@@ static void perf_output_read_group(stru
   static void perf_output_read(struct perf_output_handle *handle,
                              struct perf_event *event)
   {
- -      u64 enabled = 0, running = 0;
+ +      u64 enabled = 0, running = 0, now;
         u64 read_format = event->attr.read_format;
   
         /*
@@@ -3903,7 -3782,7 +3903,7 @@@
          * NMI context
          */
         if (read_format & PERF_FORMAT_TOTAL_TIMES)
- -              calc_timer_values(event, &enabled, &running);
+ +              calc_timer_values(event, &now, &enabled, &running);
   
         if (event->attr.read_format & PERF_FORMAT_GROUP)
                 perf_output_read_group(handle, event, enabled, running);
@@@ -3993,24 -3872,6 +3993,24 @@@ void perf_output_sample(struct perf_out
                         }
                 }
         }
+ +
+ +      if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ +              if (data->br_stack) {
+ +                      size_t size;
+ +
+ +                      size = data->br_stack->nr
+ +                           * sizeof(struct perf_branch_entry);
+ +
+ +                      perf_output_put(handle, data->br_stack->nr);
+ +                      perf_output_copy(handle, data->br_stack->entries, size);
+ +              } else {
+ +                      /*
+ +                       * we always store at least the value of nr
+ +                       */
+ +                      u64 nr = 0;
+ +                      perf_output_put(handle, nr);
+ +              }
+ +      }
   }
   
   void perf_prepare_sample(struct perf_event_header *header,
@@@ -4053,15 -3914,6 +4053,15 @@@
                 WARN_ON_ONCE(size & (sizeof(u64)-1));
                 header->size += size;
         }
+ +
+ +      if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ +              int size = sizeof(u64); /* nr */
+ +              if (data->br_stack) {
+ +                      size += data->br_stack->nr
+ +                            * sizeof(struct perf_branch_entry);
+ +              }
+ +              header->size += size;
+ +      }
   }
   
   static void perf_event_output(struct perf_event *event,
@@@ -4657,7 -4509,6 +4657,7 @@@ static int __perf_event_overflow(struc
   {
         int events = atomic_read(&event->event_limit);
         struct hw_perf_event *hwc = &event->hw;
+ +      u64 seq;
         int ret = 0;
   
         /*
@@@ -4667,20 -4518,14 +4667,20 @@@
         if (unlikely(!is_sampling_event(event)))
                 return 0;
   
- -      if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
- -              if (throttle) {
+ +      seq = __this_cpu_read(perf_throttled_seq);
+ +      if (seq != hwc->interrupts_seq) {
+ +              hwc->interrupts_seq = seq;
+ +              hwc->interrupts = 1;
+ +      } else {
+ +              hwc->interrupts++;
+ +              if (unlikely(throttle
+ +                           && hwc->interrupts >= max_samples_per_tick)) {
+ +                      __this_cpu_inc(perf_throttled_count);
                         hwc->interrupts = MAX_INTERRUPTS;
                         perf_log_throttle(event, 0);
                         ret = 1;
                 }
- -      } else
- -              hwc->interrupts++;
+ +      }
   
         if (event->attr.freq) {
                 u64 now = perf_clock();
@@@ -4689,7 -4534,7 +4689,7 @@@
                 hwc->freq_time_stamp = now;
   
                 if (delta > 0 && delta < 2*TICK_NSEC)
- -                      perf_adjust_period(event, delta, hwc->last_period);
+ +                      perf_adjust_period(event, delta, hwc->last_period, true);
         }
   
         /*
@@@ -5104,7 -4949,7 +5104,7 @@@ fail
         return err;
   }
   
- -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+ +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
   
   static void sw_perf_event_destroy(struct perf_event *event)
   {
@@@ -5112,7 -4957,7 +5112,7 @@@
   
         WARN_ON(event->parent);
   
- -      jump_label_dec(&perf_swevent_enabled[event_id]);
+ +      static_key_slow_dec(&perf_swevent_enabled[event_id]);
         swevent_hlist_put(event);
   }
   
@@@ -5123,12 -4968,6 +5123,12 @@@ static int perf_swevent_init(struct per
         if (event->attr.type != PERF_TYPE_SOFTWARE)
                 return -ENOENT;
   
+ +      /*
+ +       * no branch sampling for software events
+ +       */
+ +      if (has_branch_stack(event))
+ +              return -EOPNOTSUPP;
+ +
         switch (event_id) {
         case PERF_COUNT_SW_CPU_CLOCK:
         case PERF_COUNT_SW_TASK_CLOCK:
@@@ -5148,18 -4987,13 +5148,18 @@@
                 if (err)
                         return err;
   
- -              jump_label_inc(&perf_swevent_enabled[event_id]);
+ +              static_key_slow_inc(&perf_swevent_enabled[event_id]);
                 event->destroy = sw_perf_event_destroy;
         }
   
         return 0;
   }
   
+ +static int perf_swevent_event_idx(struct perf_event *event)
+ +{
+ +      return 0;
+ +}
+ +
   static struct pmu perf_swevent = {
         .task_ctx_nr    = perf_sw_context,
   
@@@ -5169,8 -5003,6 +5169,8 @@@
         .start          = perf_swevent_start,
         .stop           = perf_swevent_stop,
         .read           = perf_swevent_read,
+ +
+ +      .event_idx      = perf_swevent_event_idx,
   };
   
   #ifdef CONFIG_EVENT_TRACING
@@@ -5239,12 -5071,6 +5239,12 @@@ static int perf_tp_event_init(struct pe
         if (event->attr.type != PERF_TYPE_TRACEPOINT)
                 return -ENOENT;
   
+ +      /*
+ +       * no branch sampling for tracepoint events
+ +       */
+ +      if (has_branch_stack(event))
+ +              return -EOPNOTSUPP;
+ +
         err = perf_trace_init(event);
         if (err)
                 return err;
@@@ -5263,8 -5089,6 +5263,8 @@@ static struct pmu perf_tracepoint = 
         .start          = perf_swevent_start,
         .stop           = perf_swevent_stop,
         .read           = perf_swevent_read,
+ +
+ +      .event_idx      = perf_swevent_event_idx,
   };
   
   static inline void perf_tp_register(void)
@@@ -5470,12 -5294,6 +5470,12 @@@ static int cpu_clock_event_init(struct 
         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
                 return -ENOENT;
   
+ +      /*
+ +       * no branch sampling for software events
+ +       */
+ +      if (has_branch_stack(event))
+ +              return -EOPNOTSUPP;
+ +
         perf_swevent_init_hrtimer(event);
   
         return 0;
@@@ -5490,8 -5308,6 +5490,8 @@@ static struct pmu perf_cpu_clock = 
         .start          = cpu_clock_event_start,
         .stop           = cpu_clock_event_stop,
         .read           = cpu_clock_event_read,
+ +
+ +      .event_idx      = perf_swevent_event_idx,
   };
   
   /*
@@@ -5550,12 -5366,6 +5550,12 @@@ static int task_clock_event_init(struc
         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
                 return -ENOENT;
   
+ +      /*
+ +       * no branch sampling for software events
+ +       */
+ +      if (has_branch_stack(event))
+ +              return -EOPNOTSUPP;
+ +
         perf_swevent_init_hrtimer(event);
   
         return 0;
@@@ -5570,8 -5380,6 +5570,8 @@@ static struct pmu perf_task_clock = 
         .start          = task_clock_event_start,
         .stop           = task_clock_event_stop,
         .read           = task_clock_event_read,
+ +
+ +      .event_idx      = perf_swevent_event_idx,
   };
   
   static void perf_pmu_nop_void(struct pmu *pmu)
@@@ -5599,11 -5407,6 +5599,11 @@@ static void perf_pmu_cancel_txn(struct 
         perf_pmu_enable(pmu);
   }
   
+ +static int perf_event_idx_default(struct perf_event *event)
+ +{
+ +      return event->hw.idx + 1;
+ +}
+ +
   /*
    * Ensures all contexts with the same task_ctx_nr have the same
    * pmu_cpu_context too.
@@@ -5690,7 -5493,6 +5690,7 @@@ static int pmu_dev_alloc(struct pmu *pm
         if (!pmu->dev)
                 goto out;
   
+ +      pmu->dev->groups = pmu->attr_groups;
         device_initialize(pmu->dev);
         ret = dev_set_name(pmu->dev, "%s", pmu->name);
         if (ret)
@@@ -5794,9 -5596,6 +5794,9 @@@ got_cpu_context
                 pmu->pmu_disable = perf_pmu_nop_void;
         }
   
+ +      if (!pmu->event_idx)
+ +              pmu->event_idx = perf_event_idx_default;
+ +
         list_add_rcu(&pmu->entry, &pmus);
         ret = 0;
   unlock:
@@@ -5989,7 -5788,7 +5989,7 @@@ done
   
         if (!event->parent) {
                 if (event->attach_state & PERF_ATTACH_TASK)
- -                      jump_label_inc(&perf_sched_events.key);
+ +                      static_key_slow_inc(&perf_sched_events.key);
                 if (event->attr.mmap || event->attr.mmap_data)
                         atomic_inc(&nr_mmap_events);
                 if (event->attr.comm)
@@@ -6003,12 -5802,6 +6003,12 @@@
                                 return ERR_PTR(err);
                         }
                 }
+ +              if (has_branch_stack(event)) {
+ +                      static_key_slow_inc(&perf_sched_events.key);
+ +                      if (!(event->attach_state & PERF_ATTACH_TASK))
+ +                              atomic_inc(&per_cpu(perf_branch_stack_events,
+ +                                                  event->cpu));
+ +              }
         }
   
         return event;
@@@ -6078,40 -5871,6 +6078,40 @@@ static int perf_copy_attr(struct perf_e
         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
                 return -EINVAL;
   
+ +      if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
+ +              u64 mask = attr->branch_sample_type;
+ +
+ +              /* only using defined bits */
+ +              if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
+ +                      return -EINVAL;
+ +
+ +              /* at least one branch bit must be set */
+ +              if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
+ +                      return -EINVAL;
+ +
+ +              /* kernel level capture: check permissions */
+ +              if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+ +                  && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ +                      return -EACCES;
+ +
+ +              /* propagate priv level, when not set for branch */
+ +              if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
+ +
+ +                      /* exclude_kernel checked on syscall entry */
+ +                      if (!attr->exclude_kernel)
+ +                              mask |= PERF_SAMPLE_BRANCH_KERNEL;
+ +
+ +                      if (!attr->exclude_user)
+ +                              mask |= PERF_SAMPLE_BRANCH_USER;
+ +
+ +                      if (!attr->exclude_hv)
+ +                              mask |= PERF_SAMPLE_BRANCH_HV;
+ +                      /*
+ +                       * adjust user setting (for HW filter setup)
+ +                       */
+ +                      attr->branch_sample_type = mask;
+ +              }
+ +      }
   out:
         return ret;
   
@@@ -6267,7 -6026,7 +6267,7 @@@ SYSCALL_DEFINE5(perf_event_open
                  * - that may need work on context switch
                  */
                 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
- -              jump_label_inc(&perf_sched_events.key);
+ +              static_key_slow_inc(&perf_sched_events.key);
         }
   
         /*
@@@ -7147,8 -6906,7 +7147,7 @@@ unlock
   device_initcall(perf_event_sysfs_init);
   
   #ifdef CONFIG_CGROUP_PERF
- static struct cgroup_subsys_state *perf_cgroup_create(
-       struct cgroup_subsys *ss, struct cgroup *cont)
+ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
   {
         struct perf_cgroup *jc;
   
@@@ -7165,8 -6923,7 +7164,7 @@@
         return &jc->css;
   }
   
- static void perf_cgroup_destroy(struct cgroup_subsys *ss,
-                               struct cgroup *cont)
+ static void perf_cgroup_destroy(struct cgroup *cont)
   {
         struct perf_cgroup *jc;
         jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@@ -7182,8 -6939,7 +7180,7 @@@ static int __perf_cgroup_move(void *inf
         return 0;
   }
   
- static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                              struct cgroup_taskset *tset)
+ static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
   {
         struct task_struct *task;
   
@@@ -7191,8 -6947,8 +7188,8 @@@
                 task_function_call(task, __perf_cgroup_move, task);
   }
   
- static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
-               struct cgroup *old_cgrp, struct task_struct *task)
+ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+                            struct task_struct *task)
   {
         /*
          * cgroup_exit() is called in the copy_process() failure path.
diff --combined kernel/sched/core.c

index d2bd4647586ceeddcd08fff2f5cc417d42185d65,ff12f72160625f7bfb906ffadea536b04ea1e59c..a35cb8dbd8c47f1afef3b3eafcaa1e5d9ac8994c
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -74,7 -74,6 +74,7 @@@
   
   #include <asm/tlb.h>
   #include <asm/irq_regs.h>
+ +#include <asm/mutex.h>
   #ifdef CONFIG_PARAVIRT
   #include <asm/paravirt.h>
   #endif
@@@ -162,13 -161,13 +162,13 @@@ static int sched_feat_show(struct seq_f
   
   #ifdef HAVE_JUMP_LABEL
   
- -#define jump_label_key__true  jump_label_key_enabled
- -#define jump_label_key__false jump_label_key_disabled
+ +#define jump_label_key__true  STATIC_KEY_INIT_TRUE
+ +#define jump_label_key__false STATIC_KEY_INIT_FALSE
   
   #define SCHED_FEAT(name, enabled)     \
         jump_label_key__##enabled ,
   
- -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
+ +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
   #include "features.h"
   };
   
@@@ -176,14 -175,14 +176,14 @@@
   
   static void sched_feat_disable(int i)
   {
- -      if (jump_label_enabled(&sched_feat_keys[i]))
- -              jump_label_dec(&sched_feat_keys[i]);
+ +      if (static_key_enabled(&sched_feat_keys[i]))
+ +              static_key_slow_dec(&sched_feat_keys[i]);
   }
   
   static void sched_feat_enable(int i)
   {
- -      if (!jump_label_enabled(&sched_feat_keys[i]))
- -              jump_label_inc(&sched_feat_keys[i]);
+ +      if (!static_key_enabled(&sched_feat_keys[i]))
+ +              static_key_slow_inc(&sched_feat_keys[i]);
   }
   #else
   static void sched_feat_disable(int i) { };
@@@ -724,6 -723,9 +724,6 @@@ static void dequeue_task(struct rq *rq
         p->sched_class->dequeue_task(rq, p, flags);
   }
   
- -/*
- - * activate_task - move a task to the runqueue.
- - */
   void activate_task(struct rq *rq, struct task_struct *p, int flags)
   {
         if (task_contributes_to_load(p))
@@@ -732,6 -734,9 +732,6 @@@
         enqueue_task(rq, p, flags);
   }
   
- -/*
- - * deactivate_task - remove a task from the runqueue.
- - */
   void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
   {
         if (task_contributes_to_load(p))
@@@ -894,7 -899,7 +894,7 @@@ static void update_rq_clock_task(struc
         delta -= irq_delta;
   #endif
   #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- -      if (static_branch((&paravirt_steal_rq_enabled))) {
+ +      if (static_key_false((&paravirt_steal_rq_enabled))) {
                 u64 st;
   
                 steal = paravirt_steal_clock(cpu_of(rq));
@@@ -1284,7 -1289,7 +1284,7 @@@ static int select_fallback_rq(int cpu, 
          * leave kernel.
          */
         if (p->mm && printk_ratelimit()) {
- -              printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+ +              printk_sched("process %d (%s) no longer affine to cpu%d\n",
                                 task_pid_nr(p), p->comm, cpu);
         }
   
@@@ -1507,7 -1512,7 +1507,7 @@@ static int ttwu_activate_remote(struct 
   }
   #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
   
- -static inline int ttwu_share_cache(int this_cpu, int that_cpu)
+ +bool cpus_share_cache(int this_cpu, int that_cpu)
   {
         return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
   }
@@@ -1518,7 -1523,7 +1518,7 @@@ static void ttwu_queue(struct task_stru
         struct rq *rq = cpu_rq(cpu);
   
   #if defined(CONFIG_SMP)
- -      if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
+ +      if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
                 sched_clock_cpu(cpu); /* sync clocks x-cpu */
                 ttwu_queue_remote(p, cpu);
                 return;
@@@ -1932,6 -1937,7 +1932,6 @@@ static void finish_task_switch(struct r
         local_irq_enable();
   #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
         finish_lock_switch(rq, prev);
- -      trace_sched_stat_sleeptime(current, rq->clock);
   
         fire_sched_in_preempt_notifiers(current);
         if (mm)
@@@ -2266,10 -2272,13 +2266,10 @@@ calc_load_n(unsigned long load, unsigne
    * Once we've updated the global active value, we need to apply the exponential
    * weights adjusted to the number of cycles missed.
    */
- -static void calc_global_nohz(unsigned long ticks)
+ +static void calc_global_nohz(void)
   {
         long delta, active, n;
   
- -      if (time_before(jiffies, calc_load_update))
- -              return;
- -
         /*
          * If we crossed a calc_load_update boundary, make sure to fold
          * any pending idle changes, the respective CPUs might have
@@@ -2281,25 -2290,31 +2281,25 @@@
                 atomic_long_add(delta, &calc_load_tasks);
   
         /*
- -       * If we were idle for multiple load cycles, apply them.
+ +       * It could be the one fold was all it took, we done!
          */
- -      if (ticks >= LOAD_FREQ) {
- -              n = ticks / LOAD_FREQ;
+ +      if (time_before(jiffies, calc_load_update + 10))
+ +              return;
   
- -              active = atomic_long_read(&calc_load_tasks);
- -              active = active > 0 ? active * FIXED_1 : 0;
+ +      /*
+ +       * Catch-up, fold however many we are behind still
+ +       */
+ +      delta = jiffies - calc_load_update - 10;
+ +      n = 1 + (delta / LOAD_FREQ);
   
- -              avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
- -              avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
- -              avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+ +      active = atomic_long_read(&calc_load_tasks);
+ +      active = active > 0 ? active * FIXED_1 : 0;
   
- -              calc_load_update += n * LOAD_FREQ;
- -      }
+ +      avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ +      avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ +      avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
   
- -      /*
- -       * Its possible the remainder of the above division also crosses
- -       * a LOAD_FREQ period, the regular check in calc_global_load()
- -       * which comes after this will take care of that.
- -       *
- -       * Consider us being 11 ticks before a cycle completion, and us
- -       * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
- -       * age us 4 cycles, and the test in calc_global_load() will
- -       * pick up the final one.
- -       */
+ +      calc_load_update += n * LOAD_FREQ;
   }
   #else
   void calc_load_account_idle(struct rq *this_rq)
@@@ -2311,7 -2326,7 +2311,7 @@@ static inline long calc_load_fold_idle(
         return 0;
   }
   
- -static void calc_global_nohz(unsigned long ticks)
+ +static void calc_global_nohz(void)
   {
   }
   #endif
@@@ -2339,6 -2354,8 +2339,6 @@@ void calc_global_load(unsigned long tic
   {
         long active;
   
- -      calc_global_nohz(ticks);
- -
         if (time_before(jiffies, calc_load_update + 10))
                 return;
   
@@@ -2350,16 -2367,6 +2350,16 @@@
         avenrun[2] = calc_load(avenrun[2], EXP_15, active);
   
         calc_load_update += LOAD_FREQ;
+ +
+ +      /*
+ +       * Account one period with whatever state we found before
+ +       * folding in the nohz state and ageing the entire idle period.
+ +       *
+ +       * This avoids loosing a sample when we go idle between 
+ +       * calc_load_account_active() (10 ticks ago) and now and thus
+ +       * under-accounting.
+ +       */
+ +      calc_global_nohz();
   }
   
   /*
@@@ -2754,7 -2761,7 +2754,7 @@@ void account_idle_time(cputime_t cputim
   static __always_inline bool steal_account_process_tick(void)
   {
   #ifdef CONFIG_PARAVIRT
- -      if (static_branch(&paravirt_steal_enabled)) {
+ +      if (static_key_false(&paravirt_steal_enabled)) {
                 u64 steal, st = 0;
   
                 steal = paravirt_steal_clock(smp_processor_id());
@@@ -3219,14 -3226,14 +3219,14 @@@ need_resched
   
         post_schedule(rq);
   
- -      preempt_enable_no_resched();
+ +      sched_preempt_enable_no_resched();
         if (need_resched())
                 goto need_resched;
   }
   
   static inline void sched_submit_work(struct task_struct *tsk)
   {
- -      if (!tsk->state)
+ +      if (!tsk->state || tsk_is_pi_blocked(tsk))
                 return;
         /*
          * If we are going to sleep and we have plugged IO queued,
@@@ -3245,18 -3252,6 +3245,18 @@@ asmlinkage void __sched schedule(void
   }
   EXPORT_SYMBOL(schedule);
   
+ +/**
+ + * schedule_preempt_disabled - called with preemption disabled
+ + *
+ + * Returns with preemption disabled. Note: preempt_count must be 1
+ + */
+ +void __sched schedule_preempt_disabled(void)
+ +{
+ +      sched_preempt_enable_no_resched();
+ +      schedule();
+ +      preempt_disable();
+ +}
+ +
   #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
   
   static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
@@@ -3417,9 -3412,9 +3417,9 @@@ EXPORT_SYMBOL(__wake_up)
   /*
    * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
    */
- -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
   {
- -      __wake_up_common(q, mode, 1, 0, NULL);
+ +      __wake_up_common(q, mode, nr, 0, NULL);
   }
   EXPORT_SYMBOL_GPL(__wake_up_locked);
   
@@@ -3778,24 -3773,6 +3778,24 @@@ void rt_mutex_setprio(struct task_struc
   
         rq = __task_rq_lock(p);
   
+ +      /*
+ +       * Idle task boosting is a nono in general. There is one
+ +       * exception, when PREEMPT_RT and NOHZ is active:
+ +       *
+ +       * The idle task calls get_next_timer_interrupt() and holds
+ +       * the timer wheel base->lock on the CPU and another CPU wants
+ +       * to access the timer (probably to cancel it). We can safely
+ +       * ignore the boosting request, as the idle CPU runs this code
+ +       * with interrupts disabled and will complete the lock
+ +       * protected section without being interrupted. So there is no
+ +       * real need to boost.
+ +       */
+ +      if (unlikely(p == rq->idle)) {
+ +              WARN_ON(p != rq->curr);
+ +              WARN_ON(p->pi_blocked_on);
+ +              goto out_unlock;
+ +      }
+ +
         trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
         prev_class = p->sched_class;
@@@ -3819,10 -3796,11 +3819,10 @@@
                 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
   
         check_class_changed(rq, p, prev_class, oldprio);
+ +out_unlock:
         __task_rq_unlock(rq);
   }
- -
   #endif
- -
   void set_user_nice(struct task_struct *p, long nice)
   {
         int old_prio, delta, on_rq;
@@@ -4156,7 -4134,7 +4156,7 @@@ recheck
         on_rq = p->on_rq;
         running = task_current(rq, p);
         if (on_rq)
- -              deactivate_task(rq, p, 0);
+ +              dequeue_task(rq, p, 0);
         if (running)
                 p->sched_class->put_prev_task(rq, p);
   
@@@ -4169,7 -4147,7 +4169,7 @@@
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (on_rq)
- -              activate_task(rq, p, 0);
+ +              enqueue_task(rq, p, 0);
   
         check_class_changed(rq, p, prev_class, oldprio);
         task_rq_unlock(rq, p, &flags);
@@@ -4502,7 -4480,7 +4502,7 @@@ SYSCALL_DEFINE0(sched_yield
         __release(rq->lock);
         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
         do_raw_spin_unlock(&rq->lock);
- -      preempt_enable_no_resched();
+ +      sched_preempt_enable_no_resched();
   
         schedule();
   
@@@ -4576,24 -4554,8 +4576,24 @@@ EXPORT_SYMBOL(__cond_resched_softirq)
   /**
    * yield - yield the current processor to other threads.
    *
- - * This is a shortcut for kernel-space yielding - it marks the
- - * thread runnable and calls sys_sched_yield().
+ + * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ + *
+ + * The scheduler is at all times free to pick the calling task as the most
+ + * eligible task to run, if removing the yield() call from your code breaks
+ + * it, its already broken.
+ + *
+ + * Typical broken usage is:
+ + *
+ + * while (!event)
+ + *    yield();
+ + *
+ + * where one assumes that yield() will let 'the other' process run that will
+ + * make event true. If the current task is a SCHED_FIFO task that will never
+ + * happen. Never use yield() as a progress guarantee!!
+ + *
+ + * If you want to use yield() to wait for something, use wait_event().
+ + * If you want to use yield() to be 'nice' for others, use cond_resched().
+ + * If you still want to use yield(), do not!
    */
   void __sched yield(void)
   {
@@@ -5036,9 -4998,9 +5036,9 @@@ static int __migrate_task(struct task_s
          * placed properly.
          */
         if (p->on_rq) {
- -              deactivate_task(rq_src, p, 0);
+ +              dequeue_task(rq_src, p, 0);
                 set_task_cpu(p, dest_cpu);
- -              activate_task(rq_dest, p, 0);
+ +              enqueue_task(rq_dest, p, 0);
                 check_preempt_curr(rq_dest, p, 0);
         }
   done:
@@@ -5425,7 -5387,7 +5425,7 @@@ static int __cpuinit sched_cpu_active(s
                                       unsigned long action, void *hcpu)
   {
         switch (action & ~CPU_TASKS_FROZEN) {
- -      case CPU_ONLINE:
+ +      case CPU_STARTING:
         case CPU_DOWN_FAILED:
                 set_cpu_active((long)hcpu, true);
                 return NOTIFY_OK;
@@@ -5797,7 -5759,7 +5797,7 @@@ static void destroy_sched_domains(struc
    *
    * Also keep a unique ID per domain (we use the first cpu number in
    * the cpumask of the domain), this allows us to quickly tell if
- - * two cpus are in the same cache domain, see ttwu_share_cache().
+ + * two cpus are in the same cache domain, see cpus_share_cache().
    */
   DEFINE_PER_CPU(struct sched_domain *, sd_llc);
   DEFINE_PER_CPU(int, sd_llc_id);
@@@ -6974,9 -6936,6 +6974,9 @@@ void __init sched_init(void
                 rq->online = 0;
                 rq->idle_stamp = 0;
                 rq->avg_idle = 2*sysctl_sched_migration_cost;
+ +
+ +              INIT_LIST_HEAD(&rq->cfs_tasks);
+ +
                 rq_attach_root(rq, &def_root_domain);
   #ifdef CONFIG_NO_HZ
                 rq->nohz_flags = 0;
@@@ -7073,10 -7032,10 +7073,10 @@@ static void normalize_task(struct rq *r
   
         on_rq = p->on_rq;
         if (on_rq)
- -              deactivate_task(rq, p, 0);
+ +              dequeue_task(rq, p, 0);
         __setscheduler(rq, p, SCHED_NORMAL, 0);
         if (on_rq) {
- -              activate_task(rq, p, 0);
+ +              enqueue_task(rq, p, 0);
                 resched_task(rq->curr);
         }
   
@@@ -7571,8 -7530,7 +7571,7 @@@ static inline struct task_group *cgroup
                             struct task_group, css);
   }
   
- static struct cgroup_subsys_state *
- cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
+ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
   {
         struct task_group *tg, *parent;
   
@@@ -7589,15 -7547,14 +7588,14 @@@
         return &tg->css;
   }
   
- static void
- cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+ static void cpu_cgroup_destroy(struct cgroup *cgrp)
   {
         struct task_group *tg = cgroup_tg(cgrp);
   
         sched_destroy_group(tg);
   }
   
- static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
                                  struct cgroup_taskset *tset)
   {
         struct task_struct *task;
@@@ -7615,7 -7572,7 +7613,7 @@@
         return 0;
   }
   
- static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ static void cpu_cgroup_attach(struct cgroup *cgrp,
                               struct cgroup_taskset *tset)
   {
         struct task_struct *task;
@@@ -7625,8 -7582,8 +7623,8 @@@
   }
   
   static void
- cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
-               struct cgroup *old_cgrp, struct task_struct *task)
+ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+               struct task_struct *task)
   {
         /*
          * cgroup_exit() is called in the copy_process() failure path.
@@@ -7976,8 -7933,7 +7974,7 @@@ struct cgroup_subsys cpu_cgroup_subsys 
    */
   
   /* create a new cpu accounting group */
- static struct cgroup_subsys_state *cpuacct_create(
-       struct cgroup_subsys *ss, struct cgroup *cgrp)
+ static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
   {
         struct cpuacct *ca;
   
@@@ -8007,8 -7963,7 +8004,7 @@@ out
   }
   
   /* destroy an existing cpu accounting group */
- static void
- cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+ static void cpuacct_destroy(struct cgroup *cgrp)
   {
         struct cpuacct *ca = cgroup_ca(cgrp);
   
diff --combined mm/memcontrol.c

index 58a08fc7414aaf4de59b57a924994dfa43452468,ae2f0a8ab7617938c7922756eb1e9fab67142582..26c6f4ec20f43e626f0564776f1b38be8a54cd41
--- 1/mm/memcontrol.c
--- 2/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@@ -230,30 -230,10 +230,30 @@@ struct mem_cgroup 
          * the counter to account for memory usage
          */
         struct res_counter res;
- -      /*
- -       * the counter to account for mem+swap usage.
- -       */
- -      struct res_counter memsw;
+ +
+ +      union {
+ +              /*
+ +               * the counter to account for mem+swap usage.
+ +               */
+ +              struct res_counter memsw;
+ +
+ +              /*
+ +               * rcu_freeing is used only when freeing struct mem_cgroup,
+ +               * so put it into a union to avoid wasting more memory.
+ +               * It must be disjoint from the css field.  It could be
+ +               * in a union with the res field, but res plays a much
+ +               * larger part in mem_cgroup life than memsw, and might
+ +               * be of interest, even at time of free, when debugging.
+ +               * So share rcu_head with the less interesting memsw.
+ +               */
+ +              struct rcu_head rcu_freeing;
+ +              /*
+ +               * But when using vfree(), that cannot be done at
+ +               * interrupt time, so we must then queue the work.
+ +               */
+ +              struct work_struct work_freeing;
+ +      };
+ +
         /*
          * Per cgroup active and inactive list, similar to the
          * per zone LRU lists.
@@@ -399,7 -379,7 +399,7 @@@ static void mem_cgroup_put(struct mem_c
   static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
   void sock_update_memcg(struct sock *sk)
   {
- -      if (static_branch(&memcg_socket_limit_enabled)) {
+ +      if (mem_cgroup_sockets_enabled) {
                 struct mem_cgroup *memcg;
   
                 BUG_ON(!sk->sk_prot->proto_cgroup);
@@@ -431,7 -411,7 +431,7 @@@ EXPORT_SYMBOL(sock_update_memcg)
   
   void sock_release_memcg(struct sock *sk)
   {
- -      if (static_branch(&memcg_socket_limit_enabled) && sk->sk_cgrp) {
+ +      if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
                 struct mem_cgroup *memcg;
                 WARN_ON(!sk->sk_cgrp->memcg);
                 memcg = sk->sk_cgrp->memcg;
@@@ -796,8 -776,7 +796,8 @@@ static void memcg_check_events(struct m
         /* threshold event is triggered in finer grain than soft limit */
         if (unlikely(mem_cgroup_event_ratelimit(memcg,
                                                 MEM_CGROUP_TARGET_THRESH))) {
- -              bool do_softlimit, do_numainfo;
+ +              bool do_softlimit;
+ +              bool do_numainfo __maybe_unused;
   
                 do_softlimit = mem_cgroup_event_ratelimit(memcg,
                                                 MEM_CGROUP_TARGET_SOFTLIMIT);
@@@ -1062,19 -1041,6 +1062,19 @@@ struct lruvec *mem_cgroup_lru_add_list(
   
         pc = lookup_page_cgroup(page);
         memcg = pc->mem_cgroup;
+ +
+ +      /*
+ +       * Surreptitiously switch any uncharged page to root:
+ +       * an uncharged page off lru does nothing to secure
+ +       * its former mem_cgroup from sudden removal.
+ +       *
+ +       * Our caller holds lru_lock, and PageCgroupUsed is updated
+ +       * under page_cgroup lock: between them, they make all uses
+ +       * of pc->mem_cgroup safe.
+ +       */
+ +      if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup)
+ +              pc->mem_cgroup = memcg = root_mem_cgroup;
+ +
         mz = page_cgroup_zoneinfo(memcg, page);
         /* compound_order() is stabilized through lru_lock */
         MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
@@@ -2441,12 -2407,8 +2441,12 @@@ static void __mem_cgroup_commit_charge(
                                        struct page *page,
                                        unsigned int nr_pages,
                                        struct page_cgroup *pc,
- -                                     enum charge_type ctype)
+ +                                     enum charge_type ctype,
+ +                                     bool lrucare)
   {
+ +      struct zone *uninitialized_var(zone);
+ +      bool was_on_lru = false;
+ +
         lock_page_cgroup(pc);
         if (unlikely(PageCgroupUsed(pc))) {
                 unlock_page_cgroup(pc);
@@@ -2457,21 -2419,6 +2457,21 @@@
          * we don't need page_cgroup_lock about tail pages, becase they are not
          * accessed by any other context at this point.
          */
+ +
+ +      /*
+ +       * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
+ +       * may already be on some other mem_cgroup's LRU.  Take care of it.
+ +       */
+ +      if (lrucare) {
+ +              zone = page_zone(page);
+ +              spin_lock_irq(&zone->lru_lock);
+ +              if (PageLRU(page)) {
+ +                      ClearPageLRU(page);
+ +                      del_page_from_lru_list(zone, page, page_lru(page));
+ +                      was_on_lru = true;
+ +              }
+ +      }
+ +
         pc->mem_cgroup = memcg;
         /*
          * We access a page_cgroup asynchronously without lock_page_cgroup().
@@@ -2495,18 -2442,9 +2495,18 @@@
                 break;
         }
   
+ +      if (lrucare) {
+ +              if (was_on_lru) {
+ +                      VM_BUG_ON(PageLRU(page));
+ +                      SetPageLRU(page);
+ +                      add_page_to_lru_list(zone, page, page_lru(page));
+ +              }
+ +              spin_unlock_irq(&zone->lru_lock);
+ +      }
+ +
         mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
         unlock_page_cgroup(pc);
- -      WARN_ON_ONCE(PageLRU(page));
+ +
         /*
          * "charge_statistics" updated event counter. Then, check it.
          * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@@ -2704,7 -2642,7 +2704,7 @@@ static int mem_cgroup_charge_common(str
         ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
         if (ret == -ENOMEM)
                 return ret;
- -      __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
+ +      __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false);
         return 0;
   }
   
@@@ -2724,6 -2662,35 +2724,6 @@@ static voi
   __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
                                         enum charge_type ctype);
   
- -static void
- -__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
- -                                      enum charge_type ctype)
- -{
- -      struct page_cgroup *pc = lookup_page_cgroup(page);
- -      struct zone *zone = page_zone(page);
- -      unsigned long flags;
- -      bool removed = false;
- -
- -      /*
- -       * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
- -       * is already on LRU. It means the page may on some other page_cgroup's
- -       * LRU. Take care of it.
- -       */
- -      spin_lock_irqsave(&zone->lru_lock, flags);
- -      if (PageLRU(page)) {
- -              del_page_from_lru_list(zone, page, page_lru(page));
- -              ClearPageLRU(page);
- -              removed = true;
- -      }
- -      __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
- -      if (removed) {
- -              add_page_to_lru_list(zone, page, page_lru(page));
- -              SetPageLRU(page);
- -      }
- -      spin_unlock_irqrestore(&zone->lru_lock, flags);
- -      return;
- -}
- -
   int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                 gfp_t gfp_mask)
   {
@@@ -2801,16 -2768,13 +2801,16 @@@ static voi
   __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
                                         enum charge_type ctype)
   {
+ +      struct page_cgroup *pc;
+ +
         if (mem_cgroup_disabled())
                 return;
         if (!memcg)
                 return;
         cgroup_exclude_rmdir(&memcg->css);
   
- -      __mem_cgroup_commit_charge_lrucare(page, memcg, ctype);
+ +      pc = lookup_page_cgroup(page);
+ +      __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
         /*
          * Now swap is on-memory. This means this page may be
          * counted both as mem and swap....double count.
@@@ -3062,6 -3026,23 +3062,6 @@@ void mem_cgroup_uncharge_end(void
         batch->memcg = NULL;
   }
   
- -/*
- - * A function for resetting pc->mem_cgroup for newly allocated pages.
- - * This function should be called if the newpage will be added to LRU
- - * before start accounting.
- - */
- -void mem_cgroup_reset_owner(struct page *newpage)
- -{
- -      struct page_cgroup *pc;
- -
- -      if (mem_cgroup_disabled())
- -              return;
- -
- -      pc = lookup_page_cgroup(newpage);
- -      VM_BUG_ON(PageCgroupUsed(pc));
- -      pc->mem_cgroup = root_mem_cgroup;
- -}
- -
   #ifdef CONFIG_SWAP
   /*
    * called after __delete_from_swap_cache() and drop "page" account.
@@@ -3266,7 -3247,7 +3266,7 @@@ int mem_cgroup_prepare_migration(struc
                 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
         else
                 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- -      __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
+ +      __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false);
         return ret;
   }
   
@@@ -3350,7 -3331,7 +3350,7 @@@ void mem_cgroup_replace_page_cache(stru
          * the newpage may be on LRU(or pagevec for LRU) already. We lock
          * LRU while we overwrite pc->mem_cgroup.
          */
- -      __mem_cgroup_commit_charge_lrucare(newpage, memcg, type);
+ +      __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true);
   }
   
   #ifdef CONFIG_DEBUG_VM
@@@ -4432,9 -4413,6 +4432,9 @@@ static void mem_cgroup_usage_unregister
          */
         BUG_ON(!thresholds);
   
+ +      if (!thresholds->primary)
+ +              goto unlock;
+ +
         usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
   
         /* Check if a threshold crossed before removing */
@@@ -4483,7 -4461,7 +4483,7 @@@ swap_buffers
   
         /* To be sure that nobody uses thresholds */
         synchronize_rcu();
- -
+ +unlock:
         mutex_unlock(&memcg->thresholds_lock);
   }
   
@@@ -4602,10 -4580,9 +4602,9 @@@ static int register_kmem_files(struct c
         return mem_cgroup_sockets_init(cont, ss);
   };
   
- static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
-                               struct cgroup *cont)
+ static void kmem_cgroup_destroy(struct cgroup *cont)
   {
-       mem_cgroup_sockets_destroy(cont, ss);
+       mem_cgroup_sockets_destroy(cont);
   }
   #else
   static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
@@@ -4613,8 -4590,7 +4612,7 @@@
         return 0;
   }
   
- static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
-                               struct cgroup *cont)
+ static void kmem_cgroup_destroy(struct cgroup *cont)
   {
   }
   #endif
@@@ -4799,27 -4775,6 +4797,27 @@@ out_free
         return NULL;
   }
   
+ +/*
+ + * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
+ + * but in process context.  The work_freeing structure is overlaid
+ + * on the rcu_freeing structure, which itself is overlaid on memsw.
+ + */
+ +static void vfree_work(struct work_struct *work)
+ +{
+ +      struct mem_cgroup *memcg;
+ +
+ +      memcg = container_of(work, struct mem_cgroup, work_freeing);
+ +      vfree(memcg);
+ +}
+ +static void vfree_rcu(struct rcu_head *rcu_head)
+ +{
+ +      struct mem_cgroup *memcg;
+ +
+ +      memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
+ +      INIT_WORK(&memcg->work_freeing, vfree_work);
+ +      schedule_work(&memcg->work_freeing);
+ +}
+ +
   /*
    * At destroying mem_cgroup, references from swap_cgroup can remain.
    * (scanning all at force_empty is too costly...)
@@@ -4843,9 -4798,9 +4841,9 @@@ static void __mem_cgroup_free(struct me
   
         free_percpu(memcg->stat);
         if (sizeof(struct mem_cgroup) < PAGE_SIZE)
- -              kfree(memcg);
+ +              kfree_rcu(memcg, rcu_freeing);
         else
- -              vfree(memcg);
+ +              call_rcu(&memcg->rcu_freeing, vfree_rcu);
   }
   
   static void mem_cgroup_get(struct mem_cgroup *memcg)
@@@ -4927,7 -4882,7 +4925,7 @@@ err_cleanup
   }
   
   static struct cgroup_subsys_state * __ref
- mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
+ mem_cgroup_create(struct cgroup *cont)
   {
         struct mem_cgroup *memcg, *parent;
         long error = -ENOMEM;
@@@ -4989,20 -4944,18 +4987,18 @@@ free_out
         return ERR_PTR(error);
   }
   
- static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
-                                       struct cgroup *cont)
+ static int mem_cgroup_pre_destroy(struct cgroup *cont)
   {
         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
   
         return mem_cgroup_force_empty(memcg, false);
   }
   
- static void mem_cgroup_destroy(struct cgroup_subsys *ss,
-                               struct cgroup *cont)
+ static void mem_cgroup_destroy(struct cgroup *cont)
   {
         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
   
-       kmem_cgroup_destroy(ss, cont);
+       kmem_cgroup_destroy(cont);
   
         mem_cgroup_put(memcg);
   }
@@@ -5339,9 -5292,8 +5335,8 @@@ static void mem_cgroup_clear_mc(void
         mem_cgroup_end_move(from);
   }
   
- static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
-                               struct cgroup *cgroup,
-                               struct cgroup_taskset *tset)
+ static int mem_cgroup_can_attach(struct cgroup *cgroup,
+                                struct cgroup_taskset *tset)
   {
         struct task_struct *p = cgroup_taskset_first(tset);
         int ret = 0;
@@@ -5379,9 -5331,8 +5374,8 @@@
         return ret;
   }
   
- static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
-                               struct cgroup *cgroup,
-                               struct cgroup_taskset *tset)
+ static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
+                                    struct cgroup_taskset *tset)
   {
         mem_cgroup_clear_mc();
   }
@@@ -5496,9 -5447,8 +5490,8 @@@ retry
         up_read(&mm->mmap_sem);
   }
   
- static void mem_cgroup_move_task(struct cgroup_subsys *ss,
-                               struct cgroup *cont,
-                               struct cgroup_taskset *tset)
+ static void mem_cgroup_move_task(struct cgroup *cont,
+                                struct cgroup_taskset *tset)
   {
         struct task_struct *p = cgroup_taskset_first(tset);
         struct mm_struct *mm = get_task_mm(p);
@@@ -5513,20 -5463,17 +5506,17 @@@
                 mem_cgroup_clear_mc();
   }
   #else /* !CONFIG_MMU */
- static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
-                               struct cgroup *cgroup,
-                               struct cgroup_taskset *tset)
+ static int mem_cgroup_can_attach(struct cgroup *cgroup,
+                                struct cgroup_taskset *tset)
   {
         return 0;
   }
- static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
-                               struct cgroup *cgroup,
-                               struct cgroup_taskset *tset)
+ static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
+                                    struct cgroup_taskset *tset)
   {
   }
- static void mem_cgroup_move_task(struct cgroup_subsys *ss,
-                               struct cgroup *cont,
-                               struct cgroup_taskset *tset)
+ static void mem_cgroup_move_task(struct cgroup *cont,
+                                struct cgroup_taskset *tset)
   {
   }
   #endif
diff --combined net/core/netprio_cgroup.c

index 4dacc44637ef8b908d339ab4898cbcf86eb817b9,22036ab732cfbb5c2d99504eb3af05b292ea1be4..ba6900f73900e0985bb65b9543e169717083d8b7
--- 1/net/core/netprio_cgroup.c
--- 2/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@@ -23,9 -23,8 +23,8 @@@
   #include <net/sock.h>
   #include <net/netprio_cgroup.h>
   
- static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
-                                              struct cgroup *cgrp);
- static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
+ static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp);
+ static void cgrp_destroy(struct cgroup *cgrp);
   static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
   
   struct cgroup_subsys net_prio_subsys = {
@@@ -58,12 -57,11 +57,12 @@@ static int get_prioidx(u32 *prio
   
         spin_lock_irqsave(&prioidx_map_lock, flags);
         prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ);
+ +      if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) {
+ +              spin_unlock_irqrestore(&prioidx_map_lock, flags);
+ +              return -ENOSPC;
+ +      }
         set_bit(prioidx, prioidx_map);
         spin_unlock_irqrestore(&prioidx_map_lock, flags);
- -      if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ)
- -              return -ENOSPC;
- -
         atomic_set(&max_prioidx, prioidx);
         *prio = prioidx;
         return 0;
@@@ -108,7 -106,7 +107,7 @@@ static void extend_netdev_table(struct 
   static void update_netdev_tables(void)
   {
         struct net_device *dev;
- -      u32 max_len = atomic_read(&max_prioidx);
+ +      u32 max_len = atomic_read(&max_prioidx) + 1;
         struct netprio_map *map;
   
         rtnl_lock();
@@@ -121,8 -119,7 +120,7 @@@
         rtnl_unlock();
   }
   
- static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
-                                                struct cgroup *cgrp)
+ static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
   {
         struct cgroup_netprio_state *cs;
         int ret;
@@@ -146,7 -143,7 +144,7 @@@
         return &cs->css;
   }
   
- static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+ static void cgrp_destroy(struct cgroup *cgrp)
   {
         struct cgroup_netprio_state *cs;
         struct net_device *dev;
@@@ -271,6 -268,7 +269,6 @@@ static int netprio_device_event(struct 
   {
         struct net_device *dev = ptr;
         struct netprio_map *old;
- -      u32 max_len = atomic_read(&max_prioidx);
   
         /*
          * Note this is called with rtnl_lock held so we have update side
@@@ -278,6 -276,11 +276,6 @@@
          */
   
         switch (event) {
- -
- -      case NETDEV_REGISTER:
- -              if (max_len)
- -                      extend_netdev_table(dev, max_len);
- -              break;
         case NETDEV_UNREGISTER:
                 old = rtnl_dereference(dev->priomap);
                 RCU_INIT_POINTER(dev->priomap, NULL);
diff --combined net/core/sock.c

index 95aff9c7419b03ab9b16b218823f533d9a2cc042,688037cb3b6e75ca847a245b220fa5409ffb94ad..1fb21b51593b4339e29d99bbb454da4c108ff829
--- 1/net/core/sock.c
--- 2/net/core/sock.c
+++ b/net/core/sock.c
@@@ -111,7 -111,7 +111,7 @@@
   #include <linux/init.h>
   #include <linux/highmem.h>
   #include <linux/user_namespace.h>
- -#include <linux/jump_label.h>
+ +#include <linux/static_key.h>
   #include <linux/memcontrol.h>
   
   #include <asm/uaccess.h>
@@@ -160,19 -160,19 +160,19 @@@ int mem_cgroup_sockets_init(struct cgro
   out:
         list_for_each_entry_continue_reverse(proto, &proto_list, node)
                 if (proto->destroy_cgroup)
-                       proto->destroy_cgroup(cgrp, ss);
+                       proto->destroy_cgroup(cgrp);
         mutex_unlock(&proto_list_mutex);
         return ret;
   }
   
- void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
+ void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
   {
         struct proto *proto;
   
         mutex_lock(&proto_list_mutex);
         list_for_each_entry_reverse(proto, &proto_list, node)
                 if (proto->destroy_cgroup)
-                       proto->destroy_cgroup(cgrp, ss);
+                       proto->destroy_cgroup(cgrp);
         mutex_unlock(&proto_list_mutex);
   }
   #endif
@@@ -184,7 -184,7 +184,7 @@@
   static struct lock_class_key af_family_keys[AF_MAX];
   static struct lock_class_key af_family_slock_keys[AF_MAX];
   
- -struct jump_label_key memcg_socket_limit_enabled;
+ +struct static_key memcg_socket_limit_enabled;
   EXPORT_SYMBOL(memcg_socket_limit_enabled);
   
   /*
@@@ -1171,10 -1171,13 +1171,10 @@@ EXPORT_SYMBOL(sock_update_classid)
   
   void sock_update_netprioidx(struct sock *sk)
   {
- -      struct cgroup_netprio_state *state;
         if (in_interrupt())
                 return;
- -      rcu_read_lock();
- -      state = task_netprio_state(current);
- -      sk->sk_cgrp_prioidx = state ? state->prioidx : 0;
- -      rcu_read_unlock();
+ +
+ +      sk->sk_cgrp_prioidx = task_netprioidx(current);
   }
   EXPORT_SYMBOL_GPL(sock_update_netprioidx);
   #endif
@@@ -1824,7 -1827,7 +1824,7 @@@ suppress_allocation
         /* Alas. Undo changes. */
         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
   
- -      sk_memory_allocated_sub(sk, amt, parent_status);
+ +      sk_memory_allocated_sub(sk, amt);
   
         return 0;
   }
@@@ -1837,7 -1840,7 +1837,7 @@@ EXPORT_SYMBOL(__sk_mem_schedule)
   void __sk_mem_reclaim(struct sock *sk)
   {
         sk_memory_allocated_sub(sk,
- -                              sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0);
+ +                              sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
   
         if (sk_under_memory_pressure(sk) &&
diff --combined net/ipv4/tcp_memcontrol.c

index 602fb305365fd5d15794009ccbf3ece2739a965a,e714c6834c90faba23e55a884ddf2b411bdcad06..e795272fbe9ed5683f32e33dd7a6767f9ca3837c
--- 1/net/ipv4/tcp_memcontrol.c
--- 2/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@@ -94,7 -94,7 +94,7 @@@ create_files
   }
   EXPORT_SYMBOL(tcp_init_cgroup);
   
- void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
+ void tcp_destroy_cgroup(struct cgroup *cgrp)
   {
         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
         struct cg_proto *cg_proto;
@@@ -111,7 -111,7 +111,7 @@@
         val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
   
         if (val != RESOURCE_MAX)
- -              jump_label_dec(&memcg_socket_limit_enabled);
+ +              static_key_slow_dec(&memcg_socket_limit_enabled);
   }
   EXPORT_SYMBOL(tcp_destroy_cgroup);
   
@@@ -143,9 -143,9 +143,9 @@@ static int tcp_update_limit(struct mem_
                                              net->ipv4.sysctl_tcp_mem[i]);
   
         if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX)
- -              jump_label_dec(&memcg_socket_limit_enabled);
+ +              static_key_slow_dec(&memcg_socket_limit_enabled);
         else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX)
- -              jump_label_inc(&memcg_socket_limit_enabled);
+ +              static_key_slow_inc(&memcg_socket_limit_enabled);
   
         return 0;
   }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 21 Mar 2012 01:11:21 +0000 (18:11 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 21 Mar 2012 01:11:21 +0000 (18:11 -0700)
		1	2
block/blk-cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/net/sock.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/events/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/memcontrol.c	patch \|	diff1 \|	diff2 \|	blob \| history
net/core/netprio_cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
net/core/sock.c	patch \|	diff1 \|	diff2 \|	blob \| history
net/ipv4/tcp_memcontrol.c	patch \|	diff1 \|	diff2 \|	blob \| history