perf/core: Fix use-after-free in perf_release()

[mirror_ubuntu-zesty-kernel.git] / kernel / events / core.c
diff --git a/kernel/events/core.c b/kernel/events/core.c

index 110b38a58493ee4ba4c19763d2678dae8815e1af..dc7ae610af94fbc3b8af4fa08dbd6124d2533a4b 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -389,8 +389,13 @@ static struct srcu_struct pmus_srcu;
   *   0 - disallow raw tracepoint access for unpriv
   *   1 - disallow cpu events for unpriv
   *   2 - disallow kernel profiling for unpriv
+ *   3 - disallow all unpriv perf event use
   */
-int sysctl_perf_event_paranoid __read_mostly = 2;
+#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
+int sysctl_perf_event_paranoid __read_mostly = 3;
+#else
+int sysctl_perf_event_paranoid __read_mostly = 1;
+#endif
  
  /* Minimum for 512 kiB + 1 user control page */
  int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
@@ -1469,7 +1474,6 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
  static void
  list_add_event(struct perf_event *event, struct perf_event_context *ctx)
  {
-
         lockdep_assert_held(&ctx->lock);
  
         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1624,6 +1628,8 @@ static void perf_group_attach(struct perf_event *event)
  {
         struct perf_event *group_leader = event->group_leader, *pos;
  
+       lockdep_assert_held(&event->ctx->lock);
+
         /*
          * We can have double attach due to group movement in perf_event_open.
          */
@@ -1697,6 +1703,8 @@ static void perf_group_detach(struct perf_event *event)
         struct perf_event *sibling, *tmp;
         struct list_head *list = NULL;
  
+       lockdep_assert_held(&event->ctx->lock);
+
         /*
          * We can have double detach due to exit/hot-unplug + close.
          */
@@ -1895,9 +1903,29 @@ __perf_remove_from_context(struct perf_event *event,
   */
  static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
  {
-       lockdep_assert_held(&event->ctx->mutex);
+       struct perf_event_context *ctx = event->ctx;
+
+       lockdep_assert_held(&ctx->mutex);
  
         event_function_call(event, __perf_remove_from_context, (void *)flags);
+
+       /*
+        * The above event_function_call() can NO-OP when it hits
+        * TASK_TOMBSTONE. In that case we must already have been detached
+        * from the context (by perf_event_exit_event()) but the grouping
+        * might still be intact.
+        */
+       WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
+       if ((flags & DETACH_GROUP) &&
+           (event->attach_state & PERF_ATTACH_GROUP)) {
+               /*
+                * Since in that case we cannot possibly be scheduled, simply
+                * detach now.
+                */
+               raw_spin_lock_irq(&ctx->lock);
+               perf_group_detach(event);
+               raw_spin_unlock_irq(&ctx->lock);
+       }
  }
  
  /*
@@ -3464,14 +3492,15 @@ struct perf_read_data {
         int ret;
  };
  
-static int find_cpu_to_read(struct perf_event *event, int local_cpu)
+static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
  {
-       int event_cpu = event->oncpu;
         u16 local_pkg, event_pkg;
  
         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
-               event_pkg =  topology_physical_package_id(event_cpu);
-               local_pkg =  topology_physical_package_id(local_cpu);
+               int local_cpu = smp_processor_id();
+
+               event_pkg = topology_physical_package_id(event_cpu);
+               local_pkg = topology_physical_package_id(local_cpu);
  
                 if (event_pkg == local_pkg)
                         return local_cpu;
@@ -3601,7 +3630,7 @@ u64 perf_event_read_local(struct perf_event *event)
  
  static int perf_event_read(struct perf_event *event, bool group)
  {
-       int ret = 0, cpu_to_read, local_cpu;
+       int event_cpu, ret = 0;
  
         /*
          * If event is enabled and currently active on a CPU, update the
@@ -3614,21 +3643,25 @@ static int perf_event_read(struct perf_event *event, bool group)
                         .ret = 0,
                 };
  
-               local_cpu = get_cpu();
-               cpu_to_read = find_cpu_to_read(event, local_cpu);
-               put_cpu();
+               event_cpu = READ_ONCE(event->oncpu);
+               if ((unsigned)event_cpu >= nr_cpu_ids)
+                       return 0;
+
+               preempt_disable();
+               event_cpu = __perf_event_read_cpu(event, event_cpu);
  
                 /*
                  * Purposely ignore the smp_call_function_single() return
                  * value.
                  *
-                * If event->oncpu isn't a valid CPU it means the event got
+                * If event_cpu isn't a valid CPU it means the event got
                  * scheduled out and that will have updated the event count.
                  *
                  * Therefore, either way, we'll have an up-to-date event count
                  * after this.
                  */
-               (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);
+               (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
+               preempt_enable();
                 ret = data.ret;
         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
                 struct perf_event_context *ctx = event->ctx;
@@ -6609,6 +6642,27 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
         char *buf = NULL;
         char *name;
  
+       if (vma->vm_flags & VM_READ)
+               prot |= PROT_READ;
+       if (vma->vm_flags & VM_WRITE)
+               prot |= PROT_WRITE;
+       if (vma->vm_flags & VM_EXEC)
+               prot |= PROT_EXEC;
+
+       if (vma->vm_flags & VM_MAYSHARE)
+               flags = MAP_SHARED;
+       else
+               flags = MAP_PRIVATE;
+
+       if (vma->vm_flags & VM_DENYWRITE)
+               flags |= MAP_DENYWRITE;
+       if (vma->vm_flags & VM_MAYEXEC)
+               flags |= MAP_EXECUTABLE;
+       if (vma->vm_flags & VM_LOCKED)
+               flags |= MAP_LOCKED;
+       if (vma->vm_flags & VM_HUGETLB)
+               flags |= MAP_HUGETLB;
+
         if (file) {
                 struct inode *inode;
                 dev_t dev;
@@ -6635,27 +6689,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
                 maj = MAJOR(dev);
                 min = MINOR(dev);
  
-               if (vma->vm_flags & VM_READ)
-                       prot |= PROT_READ;
-               if (vma->vm_flags & VM_WRITE)
-                       prot |= PROT_WRITE;
-               if (vma->vm_flags & VM_EXEC)
-                       prot |= PROT_EXEC;
-
-               if (vma->vm_flags & VM_MAYSHARE)
-                       flags = MAP_SHARED;
-               else
-                       flags = MAP_PRIVATE;
-
-               if (vma->vm_flags & VM_DENYWRITE)
-                       flags |= MAP_DENYWRITE;
-               if (vma->vm_flags & VM_MAYEXEC)
-                       flags |= MAP_EXECUTABLE;
-               if (vma->vm_flags & VM_LOCKED)
-                       flags |= MAP_LOCKED;
-               if (vma->vm_flags & VM_HUGETLB)
-                       flags |= MAP_HUGETLB;
-
                 goto got_name;
         } else {
                 if (vma->vm_ops && vma->vm_ops->name) {
@@ -9605,6 +9638,9 @@ SYSCALL_DEFINE5(perf_event_open,
         if (flags & ~PERF_FLAG_ALL)
                 return -EINVAL;
  
+       if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
         err = perf_copy_attr(attr_uptr, &attr);
         if (err)
                 return err;
@@ -10346,6 +10382,17 @@ void perf_event_free_task(struct task_struct *task)
                         continue;
  
                 mutex_lock(&ctx->mutex);
+               raw_spin_lock_irq(&ctx->lock);
+               /*
+                * Destroy the task <-> ctx relation and mark the context dead.
+                *
+                * This is important because even though the task hasn't been
+                * exposed yet the context has been (through child_list).
+                */
+               RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
+               WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+               put_task_struct(task); /* cannot be last */
+               raw_spin_unlock_irq(&ctx->lock);
  again:
                 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
                                 group_entry)