diff --git a/kernel/futex.c b/kernel/futex.c
index 57d0b3657e16b90268fa3396668bb62e6e54d287..f984a818fcb595536cf9f965650afd2721216b53 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -44,6 +44,7 @@
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
+#include <linux/compat.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/fs.h>
  * double_lock_hb() and double_unlock_hb(), respectively.
  */
 
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-int __read_mostly futex_cmpxchg_enabled;
+#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
+#define futex_cmpxchg_enabled 1
+#else
+static int  __read_mostly futex_cmpxchg_enabled;
 #endif
 
 /*
@@ -338,6 +341,12 @@ static inline bool should_fail_futex(bool fshared)
 }
 #endif /* CONFIG_FAIL_FUTEX */
 
+#ifdef CONFIG_COMPAT
+static void compat_exit_robust_list(struct task_struct *curr);
+#else
+static inline void compat_exit_robust_list(struct task_struct *curr) { }
+#endif
+
 static inline void futex_get_mm(union futex_key *key)
 {
        mmgrab(key->private.mm);
@@ -887,7 +896,7 @@ static struct task_struct *futex_find_get_task(pid_t pid)
  * Kernel cleans up PI-state, but userspace is likely hosed.
  * (Robust-futex cleanup is separate and might save the day for userspace.)
  */
-void exit_pi_state_list(struct task_struct *curr)
+static void exit_pi_state_list(struct task_struct *curr)
 {
        struct list_head *next, *head = &curr->pi_state_list;
        struct futex_pi_state *pi_state;
@@ -957,7 +966,8 @@ void exit_pi_state_list(struct task_struct *curr)
        }
        raw_spin_unlock_irq(&curr->pi_lock);
 }
-
+#else
+static inline void exit_pi_state_list(struct task_struct *curr) { }
 #endif
 
 /*
@@ -1166,12 +1176,99 @@ out_error:
        return ret;
 }
 
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @ret:       owner's current futex lock status
+ * @exiting:   Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+       if (ret != -EBUSY) {
+               WARN_ON_ONCE(exiting);
+               return;
+       }
+
+       if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+               return;
+
+       mutex_lock(&exiting->futex_exit_mutex);
+       /*
+        * No point in doing state checking here. If the waiter got here
+        * while the task was in exec()->exec_futex_release() then it can
+        * have any FUTEX_STATE_* value when the waiter has acquired the
+        * mutex. OK, if running, EXITING or DEAD if it reached exit()
+        * already. Highly unlikely and not a problem. Just one more round
+        * through the futex maze.
+        */
+       mutex_unlock(&exiting->futex_exit_mutex);
+
+       put_task_struct(exiting);
+}
+
+static int handle_exit_race(u32 __user *uaddr, u32 uval,
+                           struct task_struct *tsk)
+{
+       u32 uval2;
+
+       /*
+        * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
+        * caller that the alleged owner is busy.
+        */
+       if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+               return -EBUSY;
+
+       /*
+        * Reread the user space value to handle the following situation:
+        *
+        * CPU0                         CPU1
+        *
+        * sys_exit()                   sys_futex()
+        *  do_exit()                    futex_lock_pi()
+        *                                futex_lock_pi_atomic()
+        *   exit_signals(tsk)              No waiters:
+        *    tsk->flags |= PF_EXITING;     *uaddr == 0x00000PID
+        *  mm_release(tsk)                 Set waiter bit
+        *   exit_robust_list(tsk) {        *uaddr = 0x80000PID;
+        *      Set owner died              attach_to_pi_owner() {
+        *    *uaddr = 0xC0000000;           tsk = get_task(PID);
+        *   }                               if (!tsk->flags & PF_EXITING) {
+        *  ...                                attach();
+        *  tsk->futex_state =               } else {
+        *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
+        *                                        FUTEX_STATE_DEAD)
+        *                                       return -EAGAIN;
+        *                                     return -ESRCH; <--- FAIL
+        *                                   }
+        *
+        * Returning ESRCH unconditionally is wrong here because the
+        * user space value has been changed by the exiting task.
+        *
+        * The same logic applies to the case where the exiting task is
+        * already gone.
+        */
+       if (get_futex_value_locked(&uval2, uaddr))
+               return -EFAULT;
+
+       /* If the user space value has changed, try again. */
+       if (uval2 != uval)
+               return -EAGAIN;
+
+       /*
+        * The exiting task did not have a robust list, the robust list was
+        * corrupted or the user space value in *uaddr is simply bogus.
+        * Give up and tell user space.
+        */
+       return -ESRCH;
+}
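
For context, a hedged user-space sketch (not part of this patch): the diagram above tracks the futex word through 0x00000PID (uncontended lock), 0x80000PID (FUTEX_WAITERS set) and 0xC0000000 (FUTEX_OWNER_DIED | FUTEX_WAITERS, written by exit_robust_list()). A minimal PI lock/unlock pair following that protocol could look like the code below; pi_lock(), pi_unlock() and lock_word are illustrative names and all error handling is omitted.

#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static _Atomic unsigned int lock_word;          /* the futex word at uaddr */

static int pi_lock(void)
{
        unsigned int tid = (unsigned int)syscall(SYS_gettid);
        unsigned int expected = 0;

        /* Fast path: 0 -> TID transition, i.e. the 0x00000PID state above. */
        if (atomic_compare_exchange_strong(&lock_word, &expected, tid))
                return 0;

        /*
         * Slow path: the kernel sets FUTEX_WAITERS (0x80000PID above),
         * queues this task and handles priority inheritance.
         */
        return (int)syscall(SYS_futex, &lock_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(void)
{
        unsigned int tid = (unsigned int)syscall(SYS_gettid);
        unsigned int expected = tid;

        /* Fast path: TID -> 0; extra bits (WAITERS/OWNER_DIED) force the syscall. */
        if (atomic_compare_exchange_strong(&lock_word, &expected, 0))
                return;

        syscall(SYS_futex, &lock_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}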
+
 /*
  * Lookup the task for the TID provided from user space and attach to
  * it after doing proper sanity checks.
  */
-static int attach_to_pi_owner(u32 uval, union futex_key *key,
-                             struct futex_pi_state **ps)
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+                             struct futex_pi_state **ps,
+                             struct task_struct **exiting)
 {
        pid_t pid = uval & FUTEX_TID_MASK;
        struct futex_pi_state *pi_state;
@@ -1180,12 +1277,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
        /*
         * We are the first waiter - try to look up the real owner and attach
         * the new pi_state to it, but bail out when TID = 0 [1]
+        *
+        * The !pid check is paranoid. None of the call sites should end up
+        * with pid == 0, but better safe than sorry. Let the caller retry.
         */
        if (!pid)
-               return -ESRCH;
+               return -EAGAIN;
        p = futex_find_get_task(pid);
        if (!p)
-               return -ESRCH;
+               return handle_exit_race(uaddr, uval, NULL);
 
        if (unlikely(p->flags & PF_KTHREAD)) {
                put_task_struct(p);
@@ -1193,22 +1293,33 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
        }
 
        /*
-        * We need to look at the task state flags to figure out,
-        * whether the task is exiting. To protect against the do_exit
-        * change of the task flags, we do this protected by
-        * p->pi_lock:
+        * We need to look at the task state to figure out, whether the
+        * task is exiting. To protect against the change of the task state
+        * in futex_exit_release(), we do this protected by p->pi_lock:
         */
        raw_spin_lock_irq(&p->pi_lock);
-       if (unlikely(p->flags & PF_EXITING)) {
+       if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
                /*
-                * The task is on the way out. When PF_EXITPIDONE is
-                * set, we know that the task has finished the
-                * cleanup:
+                * The task is on the way out. When the futex state is
+                * FUTEX_STATE_DEAD, we know that the task has finished
+                * the cleanup:
                 */
-               int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+               int ret = handle_exit_race(uaddr, uval, p);
 
                raw_spin_unlock_irq(&p->pi_lock);
-               put_task_struct(p);
+               /*
+                * If the owner task is between FUTEX_STATE_EXITING and
+                * FUTEX_STATE_DEAD then store the task pointer and keep
+                * the reference on the task struct. The calling code will
+                * drop all locks, wait for the task to reach
+                * FUTEX_STATE_DEAD and then drop the refcount. This is
+                * required to prevent a live lock when the current task
+                * preempted the exiting task between the two states.
+                */
+               if (ret == -EBUSY)
+                       *exiting = p;
+               else
+                       put_task_struct(p);
                return ret;
        }
 
@@ -1247,7 +1358,8 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 
 static int lookup_pi_state(u32 __user *uaddr, u32 uval,
                           struct futex_hash_bucket *hb,
-                          union futex_key *key, struct futex_pi_state **ps)
+                          union futex_key *key, struct futex_pi_state **ps,
+                          struct task_struct **exiting)
 {
        struct futex_q *top_waiter = futex_top_waiter(hb, key);
 
@@ -1262,18 +1374,20 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
         * We are the first waiter - try to look up the owner based on
         * @uval and attach to it.
         */
-       return attach_to_pi_owner(uval, key, ps);
+       return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
 }
 
 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
 {
+       int err;
        u32 uninitialized_var(curval);
 
        if (unlikely(should_fail_futex(true)))
                return -EFAULT;
 
-       if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
-               return -EFAULT;
+       err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+       if (unlikely(err))
+               return err;
 
        /* If user space value changed, let the caller retry */
        return curval != uval ? -EAGAIN : 0;
@@ -1288,6 +1402,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
  *                     lookup
  * @task:              the task to perform the atomic lock work for.  This will
  *                     be "current" except in the case of requeue pi.
+ * @exiting:           Pointer to store the task pointer of the owner task
+ *                     which is in the middle of exiting
  * @set_waiters:       force setting the FUTEX_WAITERS bit (1) or not (0)
  *
  * Return:
@@ -1296,11 +1412,17 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
  *  - <0 - error
  *
  * The hb->lock and futex_key refs shall be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
  */
 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                                union futex_key *key,
                                struct futex_pi_state **ps,
-                               struct task_struct *task, int set_waiters)
+                               struct task_struct *task,
+                               struct task_struct **exiting,
+                               int set_waiters)
 {
        u32 uval, newval, vpid = task_pid_vnr(task);
        struct futex_q *top_waiter;
@@ -1370,7 +1492,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
         * attach to the owner. If that fails, no harm done, we only
         * set the FUTEX_WAITERS bit in the user space variable.
         */
-       return attach_to_pi_owner(uval, key, ps);
+       return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
 }
 
 /**
@@ -1405,11 +1527,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
        if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
                return;
 
-       /*
-        * Queue the task for later wakeup for after we've released
-        * the hb->lock. wake_q_add() grabs reference to p.
-        */
-       wake_q_add(wake_q, p);
+       get_task_struct(p);
        __unqueue_futex(q);
        /*
         * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
@@ -1419,6 +1537,13 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
         * plist_del in __unqueue_futex().
         */
        smp_store_release(&q->lock_ptr, NULL);
+
+       /*
+        * Queue the task for later wakeup for after we've released
+        * the hb->lock. wake_q_add() grabs reference to p.
+        */
+       wake_q_add(wake_q, p);
+       put_task_struct(p);
 }
 
 /*
@@ -1456,10 +1581,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
        if (unlikely(should_fail_futex(true)))
                ret = -EFAULT;
 
-       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
-               ret = -EFAULT;
-
-       } else if (curval != uval) {
+       ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+       if (!ret && (curval != uval)) {
                /*
                 * If an unconditional UNLOCK_PI operation (user space did not
                 * try the TID->0 transition) raced with a waiter setting the
@@ -1654,32 +1777,32 @@ retry_private:
        double_lock_hb(hb1, hb2);
        op_ret = futex_atomic_op_inuser(op, uaddr2);
        if (unlikely(op_ret < 0)) {
-
                double_unlock_hb(hb1, hb2);
 
-#ifndef CONFIG_MMU
-               /*
-                * we don't get EFAULT from MMU faults if we don't have an MMU,
-                * but we might get them from range checking
-                */
-               ret = op_ret;
-               goto out_put_keys;
-#endif
-
-               if (unlikely(op_ret != -EFAULT)) {
+               if (!IS_ENABLED(CONFIG_MMU) ||
+                   unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+                       /*
+                        * we don't get EFAULT from MMU faults if we don't have
+                        * an MMU, but we might get them from range checking
+                        */
                        ret = op_ret;
                        goto out_put_keys;
                }
 
-               ret = fault_in_user_writeable(uaddr2);
-               if (ret)
-                       goto out_put_keys;
+               if (op_ret == -EFAULT) {
+                       ret = fault_in_user_writeable(uaddr2);
+                       if (ret)
+                               goto out_put_keys;
+               }
 
-               if (!(flags & FLAGS_SHARED))
+               if (!(flags & FLAGS_SHARED)) {
+                       cond_resched();
                        goto retry_private;
+               }
 
                put_futex_key(&key2);
                put_futex_key(&key1);
+               cond_resched();
                goto retry;
        }
 
@@ -1788,6 +1911,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  * @key1:              the from futex key
  * @key2:              the to futex key
  * @ps:                        address to store the pi_state pointer
+ * @exiting:           Pointer to store the task pointer of the owner task
+ *                     which is in the middle of exiting
  * @set_waiters:       force setting the FUTEX_WAITERS bit (1) or not (0)
  *
  * Try and get the lock on behalf of the top waiter if we can do it atomically.
@@ -1795,16 +1920,20 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
  * hb1 and hb2 must be held by the caller.
  *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ *
  * Return:
  *  -  0 - failed to acquire the lock atomically;
  *  - >0 - acquired the lock, return value is vpid of the top_waiter
  *  - <0 - error
  */
-static int futex_proxy_trylock_atomic(u32 __user *pifutex,
-                                struct futex_hash_bucket *hb1,
-                                struct futex_hash_bucket *hb2,
-                                union futex_key *key1, union futex_key *key2,
-                                struct futex_pi_state **ps, int set_waiters)
+static int
+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
+                          struct futex_hash_bucket *hb2, union futex_key *key1,
+                          union futex_key *key2, struct futex_pi_state **ps,
+                          struct task_struct **exiting, int set_waiters)
 {
        struct futex_q *top_waiter = NULL;
        u32 curval;
@@ -1841,7 +1970,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
         */
        vpid = task_pid_vnr(top_waiter->task);
        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
-                                  set_waiters);
+                                  exiting, set_waiters);
        if (ret == 1) {
                requeue_pi_wake_futex(top_waiter, key2, hb2);
                return vpid;
@@ -1878,6 +2007,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
        struct futex_q *this, *next;
        DEFINE_WAKE_Q(wake_q);
 
+       if (nr_wake < 0 || nr_requeue < 0)
+               return -EINVAL;
+
        /*
         * When PI not supported: return -ENOSYS if requeue_pi is true,
         * consequently the compiler knows requeue_pi is always false past
@@ -1967,6 +2099,8 @@ retry_private:
        }
 
        if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+               struct task_struct *exiting = NULL;
+
                /*
                 * Attempt to acquire uaddr2 and wake the top waiter. If we
                 * intend to requeue waiters, force setting the FUTEX_WAITERS
@@ -1974,7 +2108,8 @@ retry_private:
                 * faults rather in the requeue loop below.
                 */
                ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
-                                                &key2, &pi_state, nr_requeue);
+                                                &key2, &pi_state,
+                                                &exiting, nr_requeue);
 
                /*
                 * At this point the top_waiter has either taken uaddr2 or is
@@ -2001,7 +2136,8 @@ retry_private:
                         * If that call succeeds then we have pi_state and an
                         * initial refcount on it.
                         */
-                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
+                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
+                                             &pi_state, &exiting);
                }
 
                switch (ret) {
@@ -2019,17 +2155,24 @@ retry_private:
                        if (!ret)
                                goto retry;
                        goto out;
+               case -EBUSY:
                case -EAGAIN:
                        /*
                         * Two reasons for this:
-                        * - Owner is exiting and we just wait for the
+                        * - EBUSY: Owner is exiting and we just wait for the
                         *   exit to complete.
-                        * - The user space value changed.
+                        * - EAGAIN: The user space value changed.
                         */
                        double_unlock_hb(hb1, hb2);
                        hb_waiters_dec(hb2);
                        put_futex_key(&key2);
                        put_futex_key(&key1);
+                       /*
+                        * Handle the case where the owner is in the middle of
+                        * exiting. Wait for the exit to complete otherwise
+                        * this task might loop forever, aka. live lock.
+                        */
+                       wait_for_owner_exiting(ret, exiting);
                        cond_resched();
                        goto retry;
                default:
@@ -2294,34 +2437,33 @@ static void unqueue_me_pi(struct futex_q *q)
        spin_unlock(q->lock_ptr);
 }
 
-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-                               struct task_struct *newowner)
+                               struct task_struct *argowner)
 {
-       u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        struct futex_pi_state *pi_state = q->pi_state;
        u32 uval, uninitialized_var(curval), newval;
-       struct task_struct *oldowner;
-       int ret;
+       struct task_struct *oldowner, *newowner;
+       u32 newtid;
+       int ret, err = 0;
+
+       lockdep_assert_held(q->lock_ptr);
 
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 
        oldowner = pi_state->owner;
-       /* Owner died? */
-       if (!pi_state->owner)
-               newtid |= FUTEX_OWNER_DIED;
 
        /*
-        * We are here either because we stole the rtmutex from the
-        * previous highest priority waiter or we are the highest priority
-        * waiter but have failed to get the rtmutex the first time.
+        * We are here because either:
+        *
+        *  - we stole the lock and pi_state->owner needs updating to reflect
+        *    that (@argowner == current),
         *
-        * We have to replace the newowner TID in the user space variable.
+        * or:
+        *
+        *  - someone stole our lock and we need to fix things to point to the
+        *    new owner (@argowner == NULL).
+        *
+        * Either way, we have to replace the TID in the user space variable.
         * This must be atomic as we have to preserve the owner died bit here.
         *
         * Note: We write the user space value _before_ changing the pi_state
@@ -2334,14 +2476,56 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
         * in the PID check in lookup_pi_state.
         */
 retry:
-       if (get_futex_value_locked(&uval, uaddr))
-               goto handle_fault;
+       if (!argowner) {
+               if (oldowner != current) {
+                       /*
+                        * We raced against a concurrent self; things are
+                        * already fixed up. Nothing to do.
+                        */
+                       ret = 0;
+                       goto out_unlock;
+               }
+
+               if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+                       /* We got the lock after all, nothing to fix. */
+                       ret = 0;
+                       goto out_unlock;
+               }
+
+               /*
+                * Since we just failed the trylock; there must be an owner.
+                */
+               newowner = rt_mutex_owner(&pi_state->pi_mutex);
+               BUG_ON(!newowner);
+       } else {
+               WARN_ON_ONCE(argowner != current);
+               if (oldowner == current) {
+                       /*
+                        * We raced against a concurrent self; things are
+                        * already fixed up. Nothing to do.
+                        */
+                       ret = 0;
+                       goto out_unlock;
+               }
+               newowner = argowner;
+       }
+
+       newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+       /* Owner died? */
+       if (!pi_state->owner)
+               newtid |= FUTEX_OWNER_DIED;
+
+       err = get_futex_value_locked(&uval, uaddr);
+       if (err)
+               goto handle_err;
 
        for (;;) {
                newval = (uval & FUTEX_OWNER_DIED) | newtid;
 
-               if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
-                       goto handle_fault;
+               err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+               if (err)
+                       goto handle_err;
+
                if (curval == uval)
                        break;
                uval = curval;
@@ -2369,23 +2553,37 @@ retry:
        return 0;
 
        /*
-        * To handle the page fault we need to drop the locks here. That gives
-        * the other task (either the highest priority waiter itself or the
-        * task which stole the rtmutex) the chance to try the fixup of the
-        * pi_state. So once we are back from handling the fault we need to
-        * check the pi_state after reacquiring the locks and before trying to
-        * do another fixup. When the fixup has been done already we simply
-        * return.
+        * In order to reschedule or handle a page fault, we need to drop the
+        * locks here. In the case of a fault, this gives the other task
+        * (either the highest priority waiter itself or the task which stole
+        * the rtmutex) the chance to try the fixup of the pi_state. So once we
+        * are back from handling the fault we need to check the pi_state after
+        * reacquiring the locks and before trying to do another fixup. When
+        * the fixup has been done already we simply return.
         *
         * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
         * drop hb->lock since the caller owns the hb -> futex_q relation.
         * Dropping the pi_mutex->wait_lock requires the state revalidate.
         */
-handle_fault:
+handle_err:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(q->lock_ptr);
 
-       ret = fault_in_user_writeable(uaddr);
+       switch (err) {
+       case -EFAULT:
+               ret = fault_in_user_writeable(uaddr);
+               break;
+
+       case -EAGAIN:
+               cond_resched();
+               ret = 0;
+               break;
+
+       default:
+               WARN_ON_ONCE(1);
+               ret = err;
+               break;
+       }
 
        spin_lock(q->lock_ptr);
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
@@ -2434,15 +2632,28 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
                 * Got the lock. We might not be the anticipated owner if we
                 * did a lock-steal - fix up the PI-state in that case:
                 *
-                * We can safely read pi_state->owner without holding wait_lock
-                * because we now own the rt_mutex, only the owner will attempt
-                * to change it.
+                * Speculative pi_state->owner read (we don't hold wait_lock);
+                * since we own the lock pi_state->owner == current is the
+                * stable state, anything else needs more attention.
                 */
                if (q->pi_state->owner != current)
                        ret = fixup_pi_state_owner(uaddr, q, current);
                goto out;
        }
 
+       /*
+        * If we didn't get the lock; check if anybody stole it from us. In
+        * that case, we need to fix up the uval to point to them instead of
+        * us, otherwise bad things happen. [10]
+        *
+        * Another speculative read; pi_state->owner == current is unstable
+        * but needs our attention.
+        */
+       if (q->pi_state->owner == current) {
+               ret = fixup_pi_state_owner(uaddr, q, NULL);
+               goto out;
+       }
+
        /*
         * Paranoia check. If we did not take the lock, then we should not be
         * the owner of the rt_mutex.
@@ -2677,6 +2888,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 {
        struct hrtimer_sleeper timeout, *to = NULL;
        struct futex_pi_state *pi_state = NULL;
+       struct task_struct *exiting = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct futex_hash_bucket *hb;
        struct futex_q q = futex_q_init;
@@ -2704,7 +2916,8 @@ retry:
 retry_private:
        hb = queue_lock(&q);
 
-       ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
+       ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+                                  &exiting, 0);
        if (unlikely(ret)) {
                /*
                 * Atomic work succeeded and we got the lock,
@@ -2717,15 +2930,22 @@ retry_private:
                        goto out_unlock_put_key;
                case -EFAULT:
                        goto uaddr_faulted;
+               case -EBUSY:
                case -EAGAIN:
                        /*
                         * Two reasons for this:
-                        * - Task is exiting and we just wait for the
+                        * - EBUSY: Task is exiting and we just wait for the
                         *   exit to complete.
-                        * - The user space value changed.
+                        * - EAGAIN: The user space value changed.
                         */
                        queue_unlock(hb);
                        put_futex_key(&q.key);
+                       /*
+                        * Handle the case where the owner is in the middle of
+                        * exiting. Wait for the exit to complete otherwise
+                        * this task might loop forever, aka. live lock.
+                        */
+                       wait_for_owner_exiting(ret, exiting);
                        cond_resched();
                        goto retry;
                default:
@@ -2757,35 +2977,39 @@ retry_private:
         * and BUG when futex_unlock_pi() interleaves with this.
         *
         * Therefore acquire wait_lock while holding hb->lock, but drop the
-        * latter before calling rt_mutex_start_proxy_lock(). This still fully
-        * serializes against futex_unlock_pi() as that does the exact same
-        * lock handoff sequence.
+        * latter before calling __rt_mutex_start_proxy_lock(). This
+        * interleaves with futex_unlock_pi() -- which does a similar lock
+        * handoff -- such that the latter can observe the futex_q::pi_state
+        * before __rt_mutex_start_proxy_lock() is done.
         */
        raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
        spin_unlock(q.lock_ptr);
+       /*
+        * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+        * such that futex_unlock_pi() is guaranteed to observe the waiter when
+        * it sees the futex_q::pi_state.
+        */
        ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
        raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
 
        if (ret) {
                if (ret == 1)
                        ret = 0;
-
-               spin_lock(q.lock_ptr);
-               goto no_block;
+               goto cleanup;
        }
 
-
        if (unlikely(to))
                hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
 
        ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
 
+cleanup:
        spin_lock(q.lock_ptr);
        /*
-        * If we failed to acquire the lock (signal/timeout), we must
+        * If we failed to acquire the lock (deadlock/signal/timeout), we must
         * first acquire the hb->lock before removing the lock from the
-        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
-        * wait lists consistent.
+        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
+        * lists consistent.
         *
         * In particular; it is important that futex_unlock_pi() can not
         * observe this inconsistency.
@@ -2909,6 +3133,10 @@ retry:
                 * there is no point where we hold neither; and therefore
                 * wake_futex_pi() must observe a state consistent with what we
                 * observed.
+                *
+                * In particular; this forces __rt_mutex_start_proxy_lock() to
+                * complete such that we're guaranteed to observe the
+                * rt_waiter. Also see the WARN in wake_futex_pi().
                 */
                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
                spin_unlock(&hb->lock);
@@ -2933,10 +3161,8 @@ retry:
                 * An unconditional UNLOCK_PI op raced against a waiter
                 * setting the FUTEX_WAITERS bit. Try again.
                 */
-               if (ret == -EAGAIN) {
-                       put_futex_key(&key);
-                       goto retry;
-               }
+               if (ret == -EAGAIN)
+                       goto pi_retry;
                /*
                 * wake_futex_pi has detected invalid state. Tell user
                 * space.
@@ -2951,9 +3177,19 @@ retry:
         * preserve the WAITERS bit not the OWNER_DIED one. We are the
         * owner.
         */
-       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+       if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
                spin_unlock(&hb->lock);
-               goto pi_faulted;
+               switch (ret) {
+               case -EFAULT:
+                       goto pi_faulted;
+
+               case -EAGAIN:
+                       goto pi_retry;
+
+               default:
+                       WARN_ON_ONCE(1);
+                       goto out_putkey;
+               }
        }
 
        /*
@@ -2967,6 +3203,11 @@ out_putkey:
        put_futex_key(&key);
        return ret;
 
+pi_retry:
+       put_futex_key(&key);
+       cond_resched();
+       goto retry;
+
 pi_faulted:
        put_futex_key(&key);
 
@@ -3320,54 +3561,115 @@ err_unlock:
        return ret;
 }
 
+/* Constants for the pending_op argument of handle_futex_death */
+#define HANDLE_DEATH_PENDING   true
+#define HANDLE_DEATH_LIST      false
+
 /*
  * Process a futex-list entry, check whether it's owned by the
  * dying task, and do notification if so:
  */
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
+                             bool pi, bool pending_op)
 {
        u32 uval, uninitialized_var(nval), mval;
+       int err;
+
+       /* Futex address must be 32bit aligned */
+       if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+               return -1;
 
 retry:
        if (get_user(uval, uaddr))
                return -1;
 
-       if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
-               /*
-                * Ok, this dying thread is truly holding a futex
-                * of interest. Set the OWNER_DIED bit atomically
-                * via cmpxchg, and if the value had FUTEX_WAITERS
-                * set, wake up a waiter (if any). (We have to do a
-                * futex_wake() even if OWNER_DIED is already set -
-                * to handle the rare but possible case of recursive
-                * thread-death.) The rest of the cleanup is done in
-                * userspace.
-                */
-               mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-               /*
-                * We are not holding a lock here, but we want to have
-                * the pagefault_disable/enable() protection because
-                * we want to handle the fault gracefully. If the
-                * access fails we try to fault in the futex with R/W
-                * verification via get_user_pages. get_user() above
-                * does not guarantee R/W access. If that fails we
-                * give up and leave the futex locked.
-                */
-               if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+       /*
+        * Special case for regular (non PI) futexes. The unlock path in
+        * user space has two race scenarios:
+        *
+        * 1. The unlock path releases the user space futex value and
+        *    before it can execute the futex() syscall to wake up
+        *    waiters it is killed.
+        *
+        * 2. A woken up waiter is killed before it can acquire the
+        *    futex in user space.
+        *
+        * In both cases the TID validation below would prevent a wakeup of
+        * potential waiters, which would cause these waiters to block
+        * forever.
+        *
+        * In both cases the following conditions are met:
+        *
+        *      1) task->robust_list->list_op_pending != NULL
+        *         @pending_op == true
+        *      2) User space futex value == 0
+        *      3) Regular futex: @pi == false
+        *
+        * If these conditions are met, it is safe to attempt waking up a
+        * potential waiter without touching the user space futex value and
+        * trying to set the OWNER_DIED bit. The user space futex value is
+        * uncontended and the rest of the user space mutex state is
+        * consistent, so a woken waiter will just take over the
+        * uncontended futex. Setting the OWNER_DIED bit would create
+        * inconsistent state and malfunction of the user space owner died
+        * handling.
+        */
+       if (pending_op && !pi && !uval) {
+               futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+               return 0;
+       }
+
+       if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+               return 0;
+
+       /*
+        * Ok, this dying thread is truly holding a futex
+        * of interest. Set the OWNER_DIED bit atomically
+        * via cmpxchg, and if the value had FUTEX_WAITERS
+        * set, wake up a waiter (if any). (We have to do a
+        * futex_wake() even if OWNER_DIED is already set -
+        * to handle the rare but possible case of recursive
+        * thread-death.) The rest of the cleanup is done in
+        * userspace.
+        */
+       mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+
+       /*
+        * We are not holding a lock here, but we want to have
+        * the pagefault_disable/enable() protection because
+        * we want to handle the fault gracefully. If the
+        * access fails we try to fault in the futex with R/W
+        * verification via get_user_pages. get_user() above
+        * does not guarantee R/W access. If that fails we
+        * give up and leave the futex locked.
+        */
+       if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
+               switch (err) {
+               case -EFAULT:
                        if (fault_in_user_writeable(uaddr))
                                return -1;
                        goto retry;
-               }
-               if (nval != uval)
+
+               case -EAGAIN:
+                       cond_resched();
                        goto retry;
 
-               /*
-                * Wake robust non-PI futexes here. The wakeup of
-                * PI futexes happens in exit_pi_state():
-                */
-               if (!pi && (uval & FUTEX_WAITERS))
-                       futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+               default:
+                       WARN_ON_ONCE(1);
+                       return err;
+               }
        }
+
+       if (nval != uval)
+               goto retry;
+
+       /*
+        * Wake robust non-PI futexes here. The wakeup of
+        * PI futexes happens in exit_pi_state():
+        */
+       if (!pi && (uval & FUTEX_WAITERS))
+               futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+
        return 0;
 }
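
For context, a hedged user-space sketch (not part of this patch): the list_op_pending case handled above corresponds to user space announcing an acquisition (or release) before touching the futex word. A simplified robust-list registration and lock path could look like the code below; robust_mutex, robust_init() and robust_lock() are illustrative names, and FUTEX_WAITERS/FUTEX_OWNER_DIED takeover handling is omitted.

#include <linux/futex.h>
#include <stdatomic.h>
#include <stddef.h>
#include <sys/syscall.h>
#include <unistd.h>

struct robust_mutex {
        struct robust_list list;        /* links an owned lock into the thread's list */
        _Atomic unsigned int futex;     /* the word handle_futex_death() inspects */
};

static struct robust_list_head head;    /* one per thread in a real implementation */

static void robust_init(void)
{
        head.list.next = &head.list;    /* empty circular list */
        head.futex_offset = offsetof(struct robust_mutex, futex) -
                            offsetof(struct robust_mutex, list);
        head.list_op_pending = NULL;
        /* Register the head; exit_robust_list() walks it when this task dies. */
        syscall(SYS_set_robust_list, &head, sizeof(head));
}

static void robust_lock(struct robust_mutex *m)
{
        unsigned int tid = (unsigned int)syscall(SYS_gettid);
        unsigned int expected = 0;

        /* Announce the acquisition first: this is the @pending_op case above. */
        head.list_op_pending = &m->list;

        while (!atomic_compare_exchange_strong(&m->futex, &expected, tid)) {
                /* A production implementation would set FUTEX_WAITERS here. */
                syscall(SYS_futex, &m->futex, FUTEX_WAIT, expected, NULL, NULL, 0);
                expected = 0;
        }

        /* Link the owned lock into the list, then retire the pending entry. */
        m->list.next = head.list.next;
        head.list.next = &m->list;
        head.list_op_pending = NULL;
}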
 
@@ -3395,7 +3697,7 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
  *
  * We silently return on any sign of list-walking problem.
  */
-void exit_robust_list(struct task_struct *curr)
+static void exit_robust_list(struct task_struct *curr)
 {
        struct robust_list_head __user *head = curr->robust_list;
        struct robust_list __user *entry, *next_entry, *pending;
@@ -3436,10 +3738,11 @@ void exit_robust_list(struct task_struct *curr)
                 * A pending lock might already be on the list, so
                 * don't process it twice:
                 */
-               if (entry != pending)
+               if (entry != pending) {
                        if (handle_futex_death((void __user *)entry + futex_offset,
-                                               curr, pi))
+                                               curr, pi, HANDLE_DEATH_LIST))
                                return;
+               }
                if (rc)
                        return;
                entry = next_entry;
@@ -3453,9 +3756,118 @@ void exit_robust_list(struct task_struct *curr)
                cond_resched();
        }
 
-       if (pending)
+       if (pending) {
                handle_futex_death((void __user *)pending + futex_offset,
-                                  curr, pip);
+                                  curr, pip, HANDLE_DEATH_PENDING);
+       }
+}
+
+static void futex_cleanup(struct task_struct *tsk)
+{
+       if (unlikely(tsk->robust_list)) {
+               exit_robust_list(tsk);
+               tsk->robust_list = NULL;
+       }
+
+#ifdef CONFIG_COMPAT
+       if (unlikely(tsk->compat_robust_list)) {
+               compat_exit_robust_list(tsk);
+               tsk->compat_robust_list = NULL;
+       }
+#endif
+
+       if (unlikely(!list_empty(&tsk->pi_state_list)))
+               exit_pi_state_list(tsk);
+}
+
+/**
+ * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
+ * @tsk:       task to set the state on
+ *
+ * Set the futex exit state of the task lockless. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This is called from the recursive fault handling path in do_exit().
+ *
+ * This is best effort. Either the futex exit code has run already or
+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
+ * take it over. If not, the problem is pushed back to user space. If the
+ * futex exit code did not run yet, then an already queued waiter might
+ * block forever, but there is nothing which can be done about that.
+ */
+void futex_exit_recursive(struct task_struct *tsk)
+{
+       /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
+       if (tsk->futex_state == FUTEX_STATE_EXITING)
+               mutex_unlock(&tsk->futex_exit_mutex);
+       tsk->futex_state = FUTEX_STATE_DEAD;
+}
+
+static void futex_cleanup_begin(struct task_struct *tsk)
+{
+       /*
+        * Prevent various race issues against a concurrent incoming waiter
+        * including live locks by forcing the waiter to block on
+        * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+        * attach_to_pi_owner().
+        */
+       mutex_lock(&tsk->futex_exit_mutex);
+
+       /*
+        * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+        *
+        * This ensures that all subsequent checks of tsk->futex_state in
+        * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
+        * tsk->pi_lock held.
+        *
+        * It guarantees also that a pi_state which was queued right before
+        * the state change under tsk->pi_lock by a concurrent waiter must
+        * be observed in exit_pi_state_list().
+        */
+       raw_spin_lock_irq(&tsk->pi_lock);
+       tsk->futex_state = FUTEX_STATE_EXITING;
+       raw_spin_unlock_irq(&tsk->pi_lock);
+}
+
+static void futex_cleanup_end(struct task_struct *tsk, int state)
+{
+       /*
+        * Lockless store. The only side effect is that an observer might
+        * take another loop until it becomes visible.
+        */
+       tsk->futex_state = state;
+       /*
+        * Drop the exit protection. This unblocks waiters which observed
+        * FUTEX_STATE_EXITING to reevaluate the state.
+        */
+       mutex_unlock(&tsk->futex_exit_mutex);
+}
+
+void futex_exec_release(struct task_struct *tsk)
+{
+       /*
+        * The state handling is done for consistency, but in the case of
+        * exec() there is no way to prevent further damage as the PID stays
+        * the same. But for the unlikely and arguably buggy case that a
+        * futex is held on exec(), this provides at least as much state
+        * consistency protection as is possible.
+        */
+       futex_cleanup_begin(tsk);
+       futex_cleanup(tsk);
+       /*
+        * Reset the state to FUTEX_STATE_OK. The task is alive and about
+        * to exec a new binary.
+        */
+       futex_cleanup_end(tsk, FUTEX_STATE_OK);
+}
+
+void futex_exit_release(struct task_struct *tsk)
+{
+       futex_cleanup_begin(tsk);
+       futex_cleanup(tsk);
+       futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
@@ -3551,6 +3963,193 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
+#ifdef CONFIG_COMPAT
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+                  compat_uptr_t __user *head, unsigned int *pi)
+{
+       if (get_user(*uentry, head))
+               return -EFAULT;
+
+       *entry = compat_ptr((*uentry) & ~1);
+       *pi = (unsigned int)(*uentry) & 1;
+
+       return 0;
+}
+
+static void __user *futex_uaddr(struct robust_list __user *entry,
+                               compat_long_t futex_offset)
+{
+       compat_uptr_t base = ptr_to_compat(entry);
+       void __user *uaddr = compat_ptr(base + futex_offset);
+
+       return uaddr;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+static void compat_exit_robust_list(struct task_struct *curr)
+{
+       struct compat_robust_list_head __user *head = curr->compat_robust_list;
+       struct robust_list __user *entry, *next_entry, *pending;
+       unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+       unsigned int uninitialized_var(next_pi);
+       compat_uptr_t uentry, next_uentry, upending;
+       compat_long_t futex_offset;
+       int rc;
+
+       if (!futex_cmpxchg_enabled)
+               return;
+
+       /*
+        * Fetch the list head (which was registered earlier, via
+        * sys_set_robust_list()):
+        */
+       if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+               return;
+       /*
+        * Fetch the relative futex offset:
+        */
+       if (get_user(futex_offset, &head->futex_offset))
+               return;
+       /*
+        * Fetch any possibly pending lock-add first, and handle it
+        * if it exists:
+        */
+       if (compat_fetch_robust_entry(&upending, &pending,
+                              &head->list_op_pending, &pip))
+               return;
+
+       next_entry = NULL;      /* avoid warning with gcc */
+       while (entry != (struct robust_list __user *) &head->list) {
+               /*
+                * Fetch the next entry in the list before calling
+                * handle_futex_death:
+                */
+               rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
+                       (compat_uptr_t __user *)&entry->next, &next_pi);
+               /*
+                * A pending lock might already be on the list, so
+                * don't process it twice:
+                */
+               if (entry != pending) {
+                       void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+                       if (handle_futex_death(uaddr, curr, pi,
+                                              HANDLE_DEATH_LIST))
+                               return;
+               }
+               if (rc)
+                       return;
+               uentry = next_uentry;
+               entry = next_entry;
+               pi = next_pi;
+               /*
+                * Avoid excessively long or circular lists:
+                */
+               if (!--limit)
+                       break;
+
+               cond_resched();
+       }
+       if (pending) {
+               void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+               handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
+       }
+}
+
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+               struct compat_robust_list_head __user *, head,
+               compat_size_t, len)
+{
+       if (!futex_cmpxchg_enabled)
+               return -ENOSYS;
+
+       if (unlikely(len != sizeof(*head)))
+               return -EINVAL;
+
+       current->compat_robust_list = head;
+
+       return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+                       compat_uptr_t __user *, head_ptr,
+                       compat_size_t __user *, len_ptr)
+{
+       struct compat_robust_list_head __user *head;
+       unsigned long ret;
+       struct task_struct *p;
+
+       if (!futex_cmpxchg_enabled)
+               return -ENOSYS;
+
+       rcu_read_lock();
+
+       ret = -ESRCH;
+       if (!pid)
+               p = current;
+       else {
+               p = find_task_by_vpid(pid);
+               if (!p)
+                       goto err_unlock;
+       }
+
+       ret = -EPERM;
+       if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+               goto err_unlock;
+
+       head = p->compat_robust_list;
+       rcu_read_unlock();
+
+       if (put_user(sizeof(*head), len_ptr))
+               return -EFAULT;
+       return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+       rcu_read_unlock();
+
+       return ret;
+}
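
A hedged usage sketch (not part of this patch), assuming a debugger-like caller that passes the PTRACE_MODE_READ check above; dump_robust_head() is an illustrative name:

#include <linux/futex.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void dump_robust_head(pid_t pid)
{
        struct robust_list_head *head;
        size_t len;

        /* pid == 0 means "the calling thread", as in the syscall above. */
        if (syscall(SYS_get_robust_list, (int)pid, &head, &len) == 0)
                printf("pid %d: robust list head %p, len %zu\n",
                       (int)pid, (void *)head, len);
}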
+
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+               struct compat_timespec __user *, utime, u32 __user *, uaddr2,
+               u32, val3)
+{
+       struct timespec ts;
+       ktime_t t, *tp = NULL;
+       int val2 = 0;
+       int cmd = op & FUTEX_CMD_MASK;
+
+       if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+                     cmd == FUTEX_WAIT_BITSET ||
+                     cmd == FUTEX_WAIT_REQUEUE_PI)) {
+               if (compat_get_timespec(&ts, utime))
+                       return -EFAULT;
+               if (!timespec_valid(&ts))
+                       return -EINVAL;
+
+               t = timespec_to_ktime(ts);
+               if (cmd == FUTEX_WAIT)
+                       t = ktime_add_safe(ktime_get(), t);
+               tp = &t;
+       }
+       if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+           cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
+               val2 = (int) (unsigned long) utime;
+
+       return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+}
+#endif /* CONFIG_COMPAT */
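
For context, a hedged user-space sketch (not part of this patch): the conversion above reflects the futex timeout convention in which FUTEX_WAIT takes a relative timeout (hence ktime_add_safe(ktime_get(), t) only for FUTEX_WAIT) while FUTEX_WAIT_BITSET and the PI/requeue variants take absolute deadlines. wait_relative() and wait_absolute() are illustrative names and error handling is omitted.

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static int wait_relative(uint32_t *uaddr, uint32_t expected)
{
        struct timespec rel = { .tv_sec = 1, .tv_nsec = 0 };    /* 1s from now */

        return (int)syscall(SYS_futex, uaddr, FUTEX_WAIT, expected,
                            &rel, NULL, 0);
}

static int wait_absolute(uint32_t *uaddr, uint32_t expected)
{
        struct timespec abs;

        clock_gettime(CLOCK_MONOTONIC, &abs);
        abs.tv_sec += 1;                        /* deadline: now + 1s */

        return (int)syscall(SYS_futex, uaddr, FUTEX_WAIT_BITSET, expected,
                            &abs, NULL, FUTEX_BITSET_MATCH_ANY);
}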
+
 static void __init futex_detect_cmpxchg(void)
 {
 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG