diff --git a/kernel/futex.c b/kernel/futex.c
index 57d0b3657e16b90268fa3396668bb62e6e54d287..f984a818fcb595536cf9f965650afd2721216b53 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -44,6 +44,7 @@
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
+#include <linux/compat.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/fs.h>
  * double_lock_hb() and double_unlock_hb(), respectively.
  */
 
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-int __read_mostly futex_cmpxchg_enabled;
+#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
+#define futex_cmpxchg_enabled 1
+#else
+static int  __read_mostly futex_cmpxchg_enabled;
 #endif
 
 /*
@@ -338,6 +341,12 @@ static inline bool should_fail_futex(bool fshared)
 }
 #endif /* CONFIG_FAIL_FUTEX */
 
+#ifdef CONFIG_COMPAT
+static void compat_exit_robust_list(struct task_struct *curr);
+#else
+static inline void compat_exit_robust_list(struct task_struct *curr) { }
+#endif
+
 static inline void futex_get_mm(union futex_key *key)
 {
        mmgrab(key->private.mm);
@@ -887,7 +896,7 @@ static struct task_struct *futex_find_get_task(pid_t pid)
  * Kernel cleans up PI-state, but userspace is likely hosed.
  * (Robust-futex cleanup is separate and might save the day for userspace.)
  */
-void exit_pi_state_list(struct task_struct *curr)
+static void exit_pi_state_list(struct task_struct *curr)
 {
        struct list_head *next, *head = &curr->pi_state_list;
        struct futex_pi_state *pi_state;
@@ -957,7 +966,8 @@ void exit_pi_state_list(struct task_struct *curr)
        }
        raw_spin_unlock_irq(&curr->pi_lock);
 }
-
+#else
+static inline void exit_pi_state_list(struct task_struct *curr) { }
 #endif
 
 /*
@@ -1166,12 +1176,99 @@ out_error:
        return ret;
 }
 
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @ret:       owner's current futex lock status
+ * @exiting:   Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+       if (ret != -EBUSY) {
+               WARN_ON_ONCE(exiting);
+               return;
+       }
+
+       if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+               return;
+
+       mutex_lock(&exiting->futex_exit_mutex);
+       /*
+        * No point in doing state checking here. If the waiter got here
+        * while the task was in exec()->exec_futex_release() then it can
+        * have any FUTEX_STATE_* value when the waiter has acquired the
+        * mutex. OK, if running, EXITING or DEAD if it reached exit()
+        * already. Highly unlikely and not a problem. Just one more round
+        * through the futex maze.
+        */
+       mutex_unlock(&exiting->futex_exit_mutex);
+
+       put_task_struct(exiting);
+}
+
+static int handle_exit_race(u32 __user *uaddr, u32 uval,
+                           struct task_struct *tsk)
+{
+       u32 uval2;
+
+       /*
+        * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
+        * caller that the alleged owner is busy.
+        */
+       if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+               return -EBUSY;
+
+       /*
+        * Reread the user space value to handle the following situation:
+        *
+        * CPU0                         CPU1
+        *
+        * sys_exit()                   sys_futex()
+        *  do_exit()                    futex_lock_pi()
+        *                                futex_lock_pi_atomic()
+        *   exit_signals(tsk)              No waiters:
+        *    tsk->flags |= PF_EXITING;     *uaddr == 0x00000PID
+        *  mm_release(tsk)                 Set waiter bit
+        *   exit_robust_list(tsk) {        *uaddr = 0x80000PID;
+        *      Set owner died              attach_to_pi_owner() {
+        *    *uaddr = 0xC0000000;           tsk = get_task(PID);
+        *   }                               if (!tsk->flags & PF_EXITING) {
+        *  ...                                attach();
+        *  tsk->futex_state =               } else {
+        *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
+        *                                        FUTEX_STATE_DEAD)
+        *                                       return -EAGAIN;
+        *                                     return -ESRCH; <--- FAIL
+        *                                   }
+        *
+        * Returning ESRCH unconditionally is wrong here because the
+        * user space value has been changed by the exiting task.
+        *
+        * The same logic applies to the case where the exiting task is
+        * already gone.
+        */
+       if (get_futex_value_locked(&uval2, uaddr))
+               return -EFAULT;
+
+       /* If the user space value has changed, try again. */
+       if (uval2 != uval)
+               return -EAGAIN;
+
+       /*
+        * The exiting task did not have a robust list, the robust list was
+        * corrupted or the user space value in *uaddr is simply bogus.
+        * Give up and tell user space.
+        */
+       return -ESRCH;
+}
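
For context, a hedged user-space sketch (not part of this patch): the diagram above tracks the futex word through 0x00000PID (uncontended lock), 0x80000PID (FUTEX_WAITERS set) and 0xC0000000 (FUTEX_OWNER_DIED | FUTEX_WAITERS, written by exit_robust_list()). A minimal PI lock/unlock pair following that protocol could look like the code below; pi_lock(), pi_unlock() and lock_word are illustrative names and all error handling is omitted.

#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

static _Atomic unsigned int lock_word;          /* the futex word at uaddr */

static int pi_lock(void)
{
        unsigned int tid = (unsigned int)syscall(SYS_gettid);
        unsigned int expected = 0;

        /* Fast path: 0 -> TID transition, i.e. the 0x00000PID state above. */
        if (atomic_compare_exchange_strong(&lock_word, &expected, tid))
                return 0;

        /*
         * Slow path: the kernel sets FUTEX_WAITERS (0x80000PID above),
         * queues this task and handles priority inheritance.
         */
        return (int)syscall(SYS_futex, &lock_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(void)
{
        unsigned int tid = (unsigned int)syscall(SYS_gettid);
        unsigned int expected = tid;

        /* Fast path: TID -> 0; extra bits (WAITERS/OWNER_DIED) force the syscall. */
        if (atomic_compare_exchange_strong(&lock_word, &expected, 0))
                return;

        syscall(SYS_futex, &lock_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}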
+
 /*
  * Lookup the task for the TID provided from user space and attach to
  * it after doing proper sanity checks.
  */
-static int attach_to_pi_owner(u32 uval, union futex_key *key,
-                             struct futex_pi_state **ps)
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+                             struct futex_pi_state **ps,
+                             struct task_struct **exiting)
 {
        pid_t pid = uval & FUTEX_TID_MASK;
        struct futex_pi_state *pi_state;
@@ -1180,12 +1277,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
        /*
         * We are the first waiter - try to look up the real owner and attach
         * the new pi_state to it, but bail out when TID = 0 [1]
+        *
+        * The !pid check is paranoid. None of the call sites should end up
+        * with pid == 0, but better safe than sorry. Let the caller retry.
         */
        if (!pid)
-               return -ESRCH;
+               return -EAGAIN;
        p = futex_find_get_task(pid);
        if (!p)
-               return -ESRCH;
+               return handle_exit_race(uaddr, uval, NULL);
 
        if (unlikely(p->flags & PF_KTHREAD)) {
                put_task_struct(p);
@@ -1193,22 +1293,33 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
        }
 
        /*
-        * We need to look at the task state flags to figure out,
-        * whether the task is exiting. To protect against the do_exit
-        * change of the task flags, we do this protected by
-        * p->pi_lock:
+        * We need to look at the task state to figure out, whether the
+        * task is exiting. To protect against the change of the task state
+        * in futex_exit_release(), we do this protected by p->pi_lock:
         */
        raw_spin_lock_irq(&p->pi_lock);
-       if (unlikely(p->flags & PF_EXITING)) {
+       if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
                /*
-                * The task is on the way out. When PF_EXITPIDONE is
-                * set, we know that the task has finished the
-                * cleanup:
+                * The task is on the way out. When the futex state is
+                * FUTEX_STATE_DEAD, we know that the task has finished
+                * the cleanup:
                 */
-               int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+               int ret = handle_exit_race(uaddr, uval, p);
 
                raw_spin_unlock_irq(&p->pi_lock);
-               put_task_struct(p);
+               /*
+                * If the owner task is between FUTEX_STATE_EXITING and
+                * FUTEX_STATE_DEAD then store the task pointer and keep
+                * the reference on the task struct. The calling code will
+                * drop all locks, wait for the task to reach
+                * FUTEX_STATE_DEAD and then drop the refcount. This is
+                * required to prevent a live lock when the current task
+                * preempted the exiting task between the two states.
+                */
+               if (ret == -EBUSY)
+                       *exiting = p;
+               else
+                       put_task_struct(p);
                return ret;
        }
 
@@ -1247,7 +1358,8 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 
 static int lookup_pi_state(u32 __user *uaddr, u32 uval,
                           struct futex_hash_bucket *hb,
-                          union futex_key *key, struct futex_pi_state **ps)
+                          union futex_key *key, struct futex_pi_state **ps,
+                          struct task_struct **exiting)
 {
        struct futex_q *top_waiter = futex_top_waiter(hb, key);
 
@@ -1262,18 +1374,20 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
         * We are the first waiter - try to look up the owner based on
         * @uval and attach to it.
         */
-       return attach_to_pi_owner(uval, key, ps);
+       return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
 }
 
 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
 {
+       int err;
        u32 uninitialized_var(curval);
 
        if (unlikely(should_fail_futex(true)))
                return -EFAULT;
 
-       if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
-               return -EFAULT;
+       err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+       if (unlikely(err))
+               return err;
 
        /* If user space value changed, let the caller retry */
        return curval != uval ? -EAGAIN : 0;
@@ -1288,6 +1402,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
  *                     lookup
  * @task:              the task to perform the atomic lock work for.  This will
  *                     be "current" except in the case of requeue pi.
+ * @exiting:           Pointer to store the task pointer of the owner task
+ *                     which is in the middle of exiting
  * @set_waiters:       force setting the FUTEX_WAITERS bit (1) or not (0)
  *
  * Return:
@@ -1296,11 +1412,17 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
  *  - <0 - error
  *
  * The hb->lock and futex_key refs shall be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
  */
 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                                union futex_key *key,
                                struct futex_pi_state **ps,
-                               struct task_struct *task, int set_waiters)
+                               struct task_struct *task,
+                               struct task_struct **exiting,
+                               int set_waiters)
 {
        u32 uval, newval, vpid = task_pid_vnr(task);
        struct futex_q *top_waiter;
@@ -1370,7 +1492,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
         * attach to the owner. If that fails, no harm done, we only
         * set the FUTEX_WAITERS bit in the user space variable.
         */
-       return attach_to_pi_owner(uval, key, ps);
+       return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
 }
 
 /**
@@ -1405,11 +1527,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
        if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
                return;
 
-       /*
-        * Queue the task for later wakeup for after we've released
-        * the hb->lock. wake_q_add() grabs reference to p.
-        */
-       wake_q_add(wake_q, p);
+       get_task_struct(p);
        __unqueue_futex(q);
        /*
         * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
@@ -1419,6 +1537,13 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
         * plist_del in __unqueue_futex().
         */
        smp_store_release(&q->lock_ptr, NULL);
+
+       /*
+        * Queue the task for later wakeup for after we've released
+        * the hb->lock. wake_q_add() grabs reference to p.
+        */
+       wake_q_add(wake_q, p);
+       put_task_struct(p);
 }
 
 /*
@@ -1456,10 +1581,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_
        if (unlikely(should_fail_futex(true)))
                ret = -EFAULT;
 
-       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
-               ret = -EFAULT;
-
-       } else if (curval != uval) {
+       ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+       if (!ret && (curval != uval)) {
                /*
                 * If an unconditional UNLOCK_PI operation (user space did not
                 * try the TID->0 transition) raced with a waiter setting the
@@ -1654,32 +1777,32 @@ retry_private:
        double_lock_hb(hb1, hb2);
        op_ret = futex_atomic_op_inuser(op, uaddr2);
        if (unlikely(op_ret < 0)) {
-
                double_unlock_hb(hb1, hb2);
 
-#ifndef CONFIG_MMU
-               /*
-                * we don't get EFAULT from MMU faults if we don't have an MMU,
-                * but we might get them from range checking
-                */
-               ret = op_ret;
-               goto out_put_keys;
-#endif
-
-               if (unlikely(op_ret != -EFAULT)) {
+               if (!IS_ENABLED(CONFIG_MMU) ||
+                   unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
+                       /*
+                        * we don't get EFAULT from MMU faults if we don't have
+                        * an MMU, but we might get them from range checking
+                        */
                        ret = op_ret;
                        goto out_put_keys;
                }
 
-               ret = fault_in_user_writeable(uaddr2);
-               if (ret)
-                       goto out_put_keys;
+               if (op_ret == -EFAULT) {
+                       ret = fault_in_user_writeable(uaddr2);
+                       if (ret)
+                               goto out_put_keys;
+               }
 
-               if (!(flags & FLAGS_SHARED))
+               if (!(flags & FLAGS_SHARED)) {
+                       cond_resched();
                        goto retry_private;
+               }
 
                put_futex_key(&key2);
                put_futex_key(&key1);
+               cond_resched();
                goto retry;
        }
 
@@ -1788,6 +1911,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  * @key1:              the from futex key
  * @key2:              the to futex key
  * @ps:                        address to store the pi_state pointer
+ * @exiting:           Pointer to store the task pointer of the owner task
+ *                     which is in the middle of exiting
  * @set_waiters:       force setting the FUTEX_WAITERS bit (1) or not (0)
  *
  * Try and get the lock on behalf of the top waiter if we can do it atomically.
@@ -1795,16 +1920,20 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
  * hb1 and hb2 must be held by the caller.
  *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ *
  * Return:
  *  -  0 - failed to acquire the lock atomically;
  *  - >0 - acquired the lock, return value is vpid of the top_waiter
  *  - <0 - error
  */
-static int futex_proxy_trylock_atomic(u32 __user *pifutex,
-                                struct futex_hash_bucket *hb1,
-                                struct futex_hash_bucket *hb2,
-                                union futex_key *key1, union futex_key *key2,
-                                struct futex_pi_state **ps, int set_waiters)
+static int
+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
+                          struct futex_hash_bucket *hb2, union futex_key *key1,
+                          union futex_key *key2, struct futex_pi_state **ps,
+                          struct task_struct **exiting, int set_waiters)
 {
        struct futex_q *top_waiter = NULL;
        u32 curval;
@@ -1841,7 +1970,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
         */
        vpid = task_pid_vnr(top_waiter->task);
        ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
-                                  set_waiters);
+                                  exiting, set_waiters);
        if (ret == 1) {
                requeue_pi_wake_futex(top_waiter, key2, hb2);
                return vpid;
@@ -1878,6 +2007,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
        struct futex_q *this, *next;
        DEFINE_WAKE_Q(wake_q);
 
+       if (nr_wake < 0 || nr_requeue < 0)
+               return -EINVAL;
+
        /*
         * When PI not supported: return -ENOSYS if requeue_pi is true,
         * consequently the compiler knows requeue_pi is always false past
@@ -1967,6 +2099,8 @@ retry_private:
        }
 
        if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+               struct task_struct *exiting = NULL;
+
                /*
                 * Attempt to acquire uaddr2 and wake the top waiter. If we
                 * intend to requeue waiters, force setting the FUTEX_WAITERS
@@ -1974,7 +2108,8 @@ retry_private:
                 * faults rather in the requeue loop below.
                 */
                ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
-                                                &key2, &pi_state, nr_requeue);
+                                                &key2, &pi_state,
+                                                &exiting, nr_requeue);
 
                /*
                 * At this point the top_waiter has either taken uaddr2 or is
@@ -2001,7 +2136,8 @@ retry_private:
                         * If that call succeeds then we have pi_state and an
                         * initial refcount on it.
                         */
-                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
+                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
+                                             &pi_state, &exiting);
                }
 
                switch (ret) {
@@ -2019,17 +2155,24 @@ retry_private:
                        if (!ret)
                                goto retry;
                        goto out;
+               case -EBUSY:
                case -EAGAIN:
                        /*
                         * Two reasons for this:
-                        * - Owner is exiting and we just wait for the
+                        * - EBUSY: Owner is exiting and we just wait for the
                         *   exit to complete.
-                        * - The user space value changed.
+                        * - EAGAIN: The user space value changed.
                         */
                        double_unlock_hb(hb1, hb2);
                        hb_waiters_dec(hb2);
                        put_futex_key(&key2);
                        put_futex_key(&key1);
+                       /*
+                        * Handle the case where the owner is in the middle of
+                        * exiting. Wait for the exit to complete otherwise
+                        * this task might loop forever, aka. live lock.
+                        */
+                       wait_for_owner_exiting(ret, exiting);
                        cond_resched();
                        goto retry;
                default:
@@ -2294,34 +2437,33 @@ static void unqueue_me_pi(struct futex_q *q)
        spin_unlock(q->lock_ptr);
 }
 
-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-                               struct task_struct *newowner)
+                               struct task_struct *argowner)
 {
-       u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        struct futex_pi_state *pi_state = q->pi_state;
        u32 uval, uninitialized_var(curval), newval;
-       struct task_struct *oldowner;
-       int ret;
+       struct task_struct *oldowner, *newowner;
+       u32 newtid;
+       int ret, err = 0;
+
+       lockdep_assert_held(q->lock_ptr);
 
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 
        oldowner = pi_state->owner;
-       /* Owner died? */
-       if (!pi_state->owner)
-               newtid |= FUTEX_OWNER_DIED;
 
        /*
-        * We are here either because we stole the rtmutex from the
-        * previous highest priority waiter or we are the highest priority
-        * waiter but have failed to get the rtmutex the first time.
+        * We are here because either:
+        *
+        *  - we stole the lock and pi_state->owner needs updating to reflect
+        *    that (@argowner == current),
         *
-        * We have to replace the newowner TID in the user space variable.
+        * or:
+        *
+        *  - someone stole our lock and we need to fix things to point to the
+        *    new owner (@argowner == NULL).
+        *
+        * Either way, we have to replace the TID in the user space variable.
         * This must be atomic as we have to preserve the owner died bit here.
         *
         * Note: We write the user space value _before_ changing the pi_state
@@ -2334,14 +2476,56 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
         * in the PID check in lookup_pi_state.
         */
 retry:
-       if (get_futex_value_locked(&uval, uaddr))
-               goto handle_fault;
+       if (!argowner) {
+               if (oldowner != current) {
+                       /*
+                        * We raced against a concurrent self; things are
+                        * already fixed up. Nothing to do.
+                        */
+                       ret = 0;
+                       goto out_unlock;
+               }
+
+               if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+                       /* We got the lock after all, nothing to fix. */
+                       ret = 0;
+                       goto out_unlock;
+               }
+
+               /*
+                * Since we just failed the trylock; there must be an owner.
+                */
+               newowner = rt_mutex_owner(&pi_state->pi_mutex);
+               BUG_ON(!newowner);
+       } else {
+               WARN_ON_ONCE(argowner != current);
+               if (oldowner == current) {
+                       /*
+                        * We raced against a concurrent self; things are
+                        * already fixed up. Nothing to do.
+                        */
+                       ret = 0;
+                       goto out_unlock;
+               }
+               newowner = argowner;
+       }
+
+       newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+       /* Owner died? */
+       if (!pi_state->owner)
+               newtid |= FUTEX_OWNER_DIED;
+
+       err = get_futex_value_locked(&uval, uaddr);
+       if (err)
+               goto handle_err;
 
        for (;;) {
                newval = (uval & FUTEX_OWNER_DIED) | newtid;
 
-               if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
-                       goto handle_fault;
+               err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
+               if (err)
+                       goto handle_err;
+
                if (curval == uval)
                        break;
                uval = curval;
@@ -2369,23 +2553,37 @@ retry:
        return 0;
 
        /*
-        * To handle the page fault we need to drop the locks here. That gives
-        * the other task (either the highest priority waiter itself or the
-        * task which stole the rtmutex) the chance to try the fixup of the
-        * pi_state. So once we are back from handling the fault we need to
-        * check the pi_state after reacquiring the locks and before trying to
-        * do another fixup. When the fixup has been done already we simply
-        * return.
+        * In order to reschedule or handle a page fault, we need to drop the
+        * locks here. In the case of a fault, this gives the other task
+        * (either the highest priority waiter itself or the task which stole
+        * the rtmutex) the chance to try the fixup of the pi_state. So once we
+        * are back from handling the fault we need to check the pi_state after
+        * reacquiring the locks and before trying to do another fixup. When
+        * the fixup has been done already we simply return.
         *
         * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
         * drop hb->lock since the caller owns the hb -> futex_q relation.
         * Dropping the pi_mutex->wait_lock requires the state revalidate.
         */
-handle_fault:
+handle_err:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(q->lock_ptr);
 
-       ret = fault_in_user_writeable(uaddr);
+       switch (err) {
+       case -EFAULT:
+               ret = fault_in_user_writeable(uaddr);
+               break;
+
+       case -EAGAIN:
+               cond_resched();
+               ret = 0;
+               break;
+
+       default:
+               WARN_ON_ONCE(1);
+               ret = err;
+               break;
+       }
 
        spin_lock(q->lock_ptr);
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
@@ -2434,15 +2632,28 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
                 * Got the lock. We might not be the anticipated owner if we
                 * did a lock-steal - fix up the PI-state in that case:
                 *
-                * We can safely read pi_state->owner without holding wait_lock
-                * because we now own the rt_mutex, only the owner will attempt
-                * to change it.
+                * Speculative pi_state->owner read (we don't hold wait_lock);
+                * since we own the lock pi_state->owner == current is the
+                * stable state, anything else needs more attention.
                 */
                if (q->pi_state->owner != current)
                        ret = fixup_pi_state_owner(uaddr, q, current);
                goto out;
        }
 
+       /*
+        * If we didn't get the lock; check if anybody stole it from us. In
+        * that case, we need to fix up the uval to point to them instead of
+        * us, otherwise bad things happen. [10]
+        *
+        * Another speculative read; pi_state->owner == current is unstable
+        * but needs our attention.
+        */
+       if (q->pi_state->owner == current) {
+               ret = fixup_pi_state_owner(uaddr, q, NULL);
+               goto out;
+       }
+
        /*
         * Paranoia check. If we did not take the lock, then we should not be
         * the owner of the rt_mutex.
@@ -2677,6 +2888,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 {
        struct hrtimer_sleeper timeout, *to = NULL;
        struct futex_pi_state *pi_state = NULL;
+       struct task_struct *exiting = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct futex_hash_bucket *hb;
        struct futex_q q = futex_q_init;
@@ -2704,7 +2916,8 @@ retry:
 retry_private:
        hb = queue_lock(&q);
 
-       ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
+       ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+                                  &exiting, 0);
        if (unlikely(ret)) {
                /*
                 * Atomic work succeeded and we got the lock,
@@ -2717,15 +2930,22 @@ retry_private:
                        goto out_unlock_put_key;
                case -EFAULT:
                        goto uaddr_faulted;
+               case -EBUSY:
                case -EAGAIN:
                        /*
                         * Two reasons for this:
-                        * - Task is exiting and we just wait for the
+                        * - EBUSY: Task is exiting and we just wait for the
                         *   exit to complete.
-                        * - The user space value changed.
+                        * - EAGAIN: The user space value changed.
                         */
                        queue_unlock(hb);
                        put_futex_key(&q.key);
+                       /*
+                        * Handle the case where the owner is in the middle of
+                        * exiting. Wait for the exit to complete otherwise
+                        * this task might loop forever, aka. live lock.
+                        */
+                       wait_for_owner_exiting(ret, exiting);
                        cond_resched();
                        goto retry;
                default:
@@ -2757,35 +2977,39 @@ retry_private:
         * and BUG when futex_unlock_pi() interleaves with this.
         *
         * Therefore acquire wait_lock while holding hb->lock, but drop the
-        * latter before calling rt_mutex_start_proxy_lock(). This still fully
-        * serializes against futex_unlock_pi() as that does the exact same
-        * lock handoff sequence.
+        * latter before calling __rt_mutex_start_proxy_lock(). This
+        * interleaves with futex_unlock_pi() -- which does a similar lock
+        * handoff -- such that the latter can observe the futex_q::pi_state
+        * before __rt_mutex_start_proxy_lock() is done.
         */
        raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
        spin_unlock(q.lock_ptr);
+       /*
+        * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+        * such that futex_unlock_pi() is guaranteed to observe the waiter when
+        * it sees the futex_q::pi_state.
+        */
        ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
        raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
 
        if (ret) {
                if (ret == 1)
                        ret = 0;
-
-               spin_lock(q.lock_ptr);
-               goto no_block;
+               goto cleanup;
        }
 
-
        if (unlikely(to))
                hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
 
        ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
 
+cleanup:
        spin_lock(q.lock_ptr);
        /*
-        * If we failed to acquire the lock (signal/timeout), we must
+        * If we failed to acquire the lock (deadlock/signal/timeout), we must
         * first acquire the hb->lock before removing the lock from the
-        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
-        * wait lists consistent.
+        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
+        * lists consistent.
         *
         * In particular; it is important that futex_unlock_pi() can not
         * observe this inconsistency.
@@ -2909,6 +3133,10 @@ retry:
                 * there is no point where we hold neither; and therefore
                 * wake_futex_pi() must observe a state consistent with what we
                 * observed.
+                *
+                * In particular; this forces __rt_mutex_start_proxy_lock() to
+                * complete such that we're guaranteed to observe the
+                * rt_waiter. Also see the WARN in wake_futex_pi().
                 */
                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
                spin_unlock(&hb->lock);
@@ -2933,10 +3161,8 @@ retry:
                 * An unconditional UNLOCK_PI op raced against a waiter
                 * setting the FUTEX_WAITERS bit. Try again.
                 */
-               if (ret == -EAGAIN) {
-                       put_futex_key(&key);
-                       goto retry;
-               }
+               if (ret == -EAGAIN)
+                       goto pi_retry;
                /*
                 * wake_futex_pi has detected invalid state. Tell user
                 * space.
@@ -2951,9 +3177,19 @@ retry:
         * preserve the WAITERS bit not the OWNER_DIED one. We are the
         * owner.
         */
-       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+       if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
                spin_unlock(&hb->lock);
-               goto pi_faulted;
+               switch (ret) {
+               case -EFAULT:
+                       goto pi_faulted;
+
+               case -EAGAIN:
+                       goto pi_retry;
+
+               default:
+                       WARN_ON_ONCE(1);
+                       goto out_putkey;
+               }
        }
 
        /*
@@ -2967,6 +3203,11 @@ out_putkey:
        put_futex_key(&key);
        return ret;
 
+pi_retry:
+       put_futex_key(&key);
+       cond_resched();
+       goto retry;
+
 pi_faulted:
        put_futex_key(&key);
 
@@ -3320,54 +3561,115 @@ err_unlock:
        return ret;
 }
 
+/* Constants for the pending_op argument of handle_futex_death */
+#define HANDLE_DEATH_PENDING   true
+#define HANDLE_DEATH_LIST      false
+
 /*
  * Process a futex-list entry, check whether it's owned by the
  * dying task, and do notification if so:
  */
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
+                             bool pi, bool pending_op)
 {
        u32 uval, uninitialized_var(nval), mval;
+       int err;
+
+       /* Futex address must be 32bit aligned */
+       if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+               return -1;
 
 retry:
        if (get_user(uval, uaddr))
                return -1;
 
-       if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
-               /*
-                * Ok, this dying thread is truly holding a futex
-                * of interest. Set the OWNER_DIED bit atomically
-                * via cmpxchg, and if the value had FUTEX_WAITERS
-                * set, wake up a waiter (if any). (We have to do a
-                * futex_wake() even if OWNER_DIED is already set -
-                * to handle the rare but possible case of recursive
-                * thread-death.) The rest of the cleanup is done in
-                * userspace.
-                */
-               mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-               /*
-                * We are not holding a lock here, but we want to have
-                * the pagefault_disable/enable() protection because
-                * we want to handle the fault gracefully. If the
-                * access fails we try to fault in the futex with R/W
-                * verification via get_user_pages. get_user() above
-                * does not guarantee R/W access. If that fails we
-                * give up and leave the futex locked.
-                */
-               if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+       /*
+        * Special case for regular (non PI) futexes. The unlock path in
+        * user space has two race scenarios:
+        *
+        * 1. The unlock path releases the user space futex value and
+        *    before it can execute the futex() syscall to wake up
+        *    waiters it is killed.
+        *
+        * 2. A woken up waiter is killed before it can acquire the
+        *    futex in user space.
+        *
+        * In both cases the TID validation below would prevent a wakeup of
+        * potential waiters, which would cause these waiters to block
+        * forever.
+        *
+        * In both cases the following conditions are met:
+        *
+        *      1) task->robust_list->list_op_pending != NULL
+        *         @pending_op == true
+        *      2) User space futex value == 0
+        *      3) Regular futex: @pi == false
+        *
+        * If these conditions are met, it is safe to attempt waking up a
+        * potential waiter without touching the user space futex value and
+        * trying to set the OWNER_DIED bit. The user space futex value is
+        * uncontended and the rest of the user space mutex state is
+        * consistent, so a woken waiter will just take over the
+        * uncontended futex. Setting the OWNER_DIED bit would create
+        * inconsistent state and malfunction of the user space owner died
+        * handling.
+        */
+       if (pending_op && !pi && !uval) {
+               futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+               return 0;
+       }
+
+       if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+               return 0;
+
+       /*
+        * Ok, this dying thread is truly holding a futex
+        * of interest. Set the OWNER_DIED bit atomically
+        * via cmpxchg, and if the value had FUTEX_WAITERS
+        * set, wake up a waiter (if any). (We have to do a
+        * futex_wake() even if OWNER_DIED is already set -
+        * to handle the rare but possible case of recursive
+        * thread-death.) The rest of the cleanup is done in
+        * userspace.
+        */
+       mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+
+       /*
+        * We are not holding a lock here, but we want to have
+        * the pagefault_disable/enable() protection because
+        * we want to handle the fault gracefully. If the
+        * access fails we try to fault in the futex with R/W
+        * verification via get_user_pages. get_user() above
+        * does not guarantee R/W access. If that fails we
+        * give up and leave the futex locked.
+        */
+       if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
+               switch (err) {
+               case -EFAULT:
                        if (fault_in_user_writeable(uaddr))
                                return -1;
                        goto retry;
-               }
-               if (nval != uval)
+
+               case -EAGAIN:
+                       cond_resched();
                        goto retry;
 
-               /*
-                * Wake robust non-PI futexes here. The wakeup of
-                * PI futexes happens in exit_pi_state():
-                */
-               if (!pi && (uval & FUTEX_WAITERS))
-                       futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+               default:
+                       WARN_ON_ONCE(1);
+                       return err;
+               }
        }
+
+       if (nval != uval)
+               goto retry;
+
+       /*
+        * Wake robust non-PI futexes here. The wakeup of
+        * PI futexes happens in exit_pi_state():
+        */
+       if (!pi && (uval & FUTEX_WAITERS))
+               futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+
        return 0;
 }
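
For context, a hedged user-space sketch (not part of this patch): the list_op_pending case handled above corresponds to user space announcing an acquisition (or release) before touching the futex word. A simplified robust-list registration and lock path could look like the code below; robust_mutex, robust_init() and robust_lock() are illustrative names, and FUTEX_WAITERS/FUTEX_OWNER_DIED takeover handling is omitted.

#include <linux/futex.h>
#include <stdatomic.h>
#include <stddef.h>
#include <sys/syscall.h>
#include <unistd.h>

struct robust_mutex {
        struct robust_list list;        /* links an owned lock into the thread's list */
        _Atomic unsigned int futex;     /* the word handle_futex_death() inspects */
};

static struct robust_list_head head;    /* one per thread in a real implementation */

static void robust_init(void)
{
        head.list.next = &head.list;    /* empty circular list */
        head.futex_offset = offsetof(struct robust_mutex, futex) -
                            offsetof(struct robust_mutex, list);
        head.list_op_pending = NULL;
        /* Register the head; exit_robust_list() walks it when this task dies. */
        syscall(SYS_set_robust_list, &head, sizeof(head));
}

static void robust_lock(struct robust_mutex *m)
{
        unsigned int tid = (unsigned int)syscall(SYS_gettid);
        unsigned int expected = 0;

        /* Announce the acquisition first: this is the @pending_op case above. */
        head.list_op_pending = &m->list;

        while (!atomic_compare_exchange_strong(&m->futex, &expected, tid)) {
                /* A production implementation would set FUTEX_WAITERS here. */
                syscall(SYS_futex, &m->futex, FUTEX_WAIT, expected, NULL, NULL, 0);
                expected = 0;
        }

        /* Link the owned lock into the list, then retire the pending entry. */
        m->list.next = head.list.next;
        head.list.next = &m->list;
        head.list_op_pending = NULL;
}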
 
@@ -3395,7 +3697,7 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
  *
  * We silently return on any sign of list-walking problem.
  */
-void exit_robust_list(struct task_struct *curr)
+static void exit_robust_list(struct task_struct *curr)
 {
        struct robust_list_head __user *head = curr->robust_list;
        struct robust_list __user *entry, *next_entry, *pending;
@@ -3436,10 +3738,11 @@ void exit_robust_list(struct task_struct *curr)
                 * A pending lock might already be on the list, so
                 * don't process it twice:
                 */
-               if (entry != pending)
+               if (entry != pending) {
                        if (handle_futex_death((void __user *)entry + futex_offset,
-                                               curr, pi))
+                                               curr, pi, HANDLE_DEATH_LIST))
                                return;
+               }
                if (rc)
                        return;
                entry = next_entry;
@@ -3453,9 +3756,118 @@ void exit_robust_list(struct task_struct *curr)
                cond_resched();
        }
 
-       if (pending)
+       if (pending) {
                handle_futex_death((void __user *)pending + futex_offset,
-                                  curr, pip);
+                                  curr, pip, HANDLE_DEATH_PENDING);
+       }
+}
+
+static void futex_cleanup(struct task_struct *tsk)
+{
+       if (unlikely(tsk->robust_list)) {
+               exit_robust_list(tsk);
+               tsk->robust_list = NULL;
+       }
+
+#ifdef CONFIG_COMPAT
+       if (unlikely(tsk->compat_robust_list)) {
+               compat_exit_robust_list(tsk);
+               tsk->compat_robust_list = NULL;
+       }
+#endif
+
+       if (unlikely(!list_empty(&tsk->pi_state_list)))
+               exit_pi_state_list(tsk);
+}
+
+/**
+ * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
+ * @tsk:       task to set the state on
+ *
+ * Set the futex exit state of the task lockless. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This is called from the recursive fault handling path in do_exit().
+ *
+ * This is best effort. Either the futex exit code has run already or
+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
+ * take it over. If not, the problem is pushed back to user space. If the
+ * futex exit code did not run yet, then an already queued waiter might
+ * block forever, but there is nothing which can be done about that.
+ */
+void futex_exit_recursive(struct task_struct *tsk)
+{
+       /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
+       if (tsk->futex_state == FUTEX_STATE_EXITING)
+               mutex_unlock(&tsk->futex_exit_mutex);
+       tsk->futex_state = FUTEX_STATE_DEAD;
+}
+
+static void futex_cleanup_begin(struct task_struct *tsk)
+{
+       /*
+        * Prevent various race issues against a concurrent incoming waiter
+        * including live locks by forcing the waiter to block on
+        * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+        * attach_to_pi_owner().
+        */
+       mutex_lock(&tsk->futex_exit_mutex);
+
+       /*
+        * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+        *
+        * This ensures that all subsequent checks of tsk->futex_state in
+        * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
+        * tsk->pi_lock held.
+        *
+        * It guarantees also that a pi_state which was queued right before
+        * the state change under tsk->pi_lock by a concurrent waiter must
+        * be observed in exit_pi_state_list().
+        */
+       raw_spin_lock_irq(&tsk->pi_lock);
+       tsk->futex_state = FUTEX_STATE_EXITING;
+       raw_spin_unlock_irq(&tsk->pi_lock);
+}
+
+static void futex_cleanup_end(struct task_struct *tsk, int state)
+{
+       /*
+        * Lockless store. The only side effect is that an observer might
+        * take another loop until it becomes visible.
+        */
+       tsk->futex_state = state;
+       /*
+        * Drop the exit protection. This unblocks waiters which observed
+        * FUTEX_STATE_EXITING to reevaluate the state.
+        */
+       mutex_unlock(&tsk->futex_exit_mutex);
+}
+
+void futex_exec_release(struct task_struct *tsk)
+{
+       /*
+        * The state handling is done for consistency, but in the case of
+        * exec() there is no way to prevent further damage as the PID stays
+        * the same. But for the unlikely and arguably buggy case that a
+        * futex is held on exec(), this provides at least as much state
+        * consistency protection as is possible.
+        */
+       futex_cleanup_begin(tsk);
+       futex_cleanup(tsk);
+       /*
+        * Reset the state to FUTEX_STATE_OK. The task is alive and about
+        * to exec a new binary.
+        */
+       futex_cleanup_end(tsk, FUTEX_STATE_OK);
+}
+
+void futex_exit_release(struct task_struct *tsk)
+{
+       futex_cleanup_begin(tsk);
+       futex_cleanup(tsk);
+       futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
@@ -3551,6 +3963,193 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
+#ifdef CONFIG_COMPAT
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+                  compat_uptr_t __user *head, unsigned int *pi)
+{
+       if (get_user(*uentry, head))
+               return -EFAULT;
+
+       *entry = compat_ptr((*uentry) & ~1);
+       *pi = (unsigned int)(*uentry) & 1;
+
+       return 0;
+}
+
+static void __user *futex_uaddr(struct robust_list __user *entry,
+                               compat_long_t futex_offset)
+{
+       compat_uptr_t base = ptr_to_compat(entry);
+       void __user *uaddr = compat_ptr(base + futex_offset);
+
+       return uaddr;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+static void compat_exit_robust_list(struct task_struct *curr)
+{
+       struct compat_robust_list_head __user *head = curr->compat_robust_list;
+       struct robust_list __user *entry, *next_entry, *pending;
+       unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+       unsigned int uninitialized_var(next_pi);
+       compat_uptr_t uentry, next_uentry, upending;
+       compat_long_t futex_offset;
+       int rc;
+
+       if (!futex_cmpxchg_enabled)
+               return;
+
+       /*
+        * Fetch the list head (which was registered earlier, via
+        * sys_set_robust_list()):
+        */
+       if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+               return;
+       /*
+        * Fetch the relative futex offset:
+        */
+       if (get_user(futex_offset, &head->futex_offset))
+               return;
+       /*
+        * Fetch any possibly pending lock-add first, and handle it
+        * if it exists:
+        */
+       if (compat_fetch_robust_entry(&upending, &pending,
+                              &head->list_op_pending, &pip))
+               return;
+
+       next_entry = NULL;      /* avoid warning with gcc */
+       while (entry != (struct robust_list __user *) &head->list) {
+               /*
+                * Fetch the next entry in the list before calling
+                * handle_futex_death:
+                */
+               rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
+                       (compat_uptr_t __user *)&entry->next, &next_pi);
+               /*
+                * A pending lock might already be on the list, so
+                * don't process it twice:
+                */
+               if (entry != pending) {
+                       void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+                       if (handle_futex_death(uaddr, curr, pi,
+                                              HANDLE_DEATH_LIST))
+                               return;
+               }
+               if (rc)
+                       return;
+               uentry = next_uentry;
+               entry = next_entry;
+               pi = next_pi;
+               /*
+                * Avoid excessively long or circular lists:
+                */
+               if (!--limit)
+                       break;
+
+               cond_resched();
+       }
+       if (pending) {
+               void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+               handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
+       }
+}
+
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+               struct compat_robust_list_head __user *, head,
+               compat_size_t, len)
+{
+       if (!futex_cmpxchg_enabled)
+               return -ENOSYS;
+
+       if (unlikely(len != sizeof(*head)))
+               return -EINVAL;
+
+       current->compat_robust_list = head;
+
+       return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+                       compat_uptr_t __user *, head_ptr,
+                       compat_size_t __user *, len_ptr)
+{
+       struct compat_robust_list_head __user *head;
+       unsigned long ret;
+       struct task_struct *p;
+
+       if (!futex_cmpxchg_enabled)
+               return -ENOSYS;
+
+       rcu_read_lock();
+
+       ret = -ESRCH;
+       if (!pid)
+               p = current;
+       else {
+               p = find_task_by_vpid(pid);
+               if (!p)
+                       goto err_unlock;
+       }
+
+       ret = -EPERM;
+       if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+               goto err_unlock;
+
+       head = p->compat_robust_list;
+       rcu_read_unlock();
+
+       if (put_user(sizeof(*head), len_ptr))
+               return -EFAULT;
+       return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+       rcu_read_unlock();
+
+       return ret;
+}
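
A hedged usage sketch (not part of this patch), assuming a debugger-like caller that passes the PTRACE_MODE_READ check above; dump_robust_head() is an illustrative name:

#include <linux/futex.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static void dump_robust_head(pid_t pid)
{
        struct robust_list_head *head;
        size_t len;

        /* pid == 0 means "the calling thread", as in the syscall above. */
        if (syscall(SYS_get_robust_list, (int)pid, &head, &len) == 0)
                printf("pid %d: robust list head %p, len %zu\n",
                       (int)pid, (void *)head, len);
}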
+
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+               struct compat_timespec __user *, utime, u32 __user *, uaddr2,
+               u32, val3)
+{
+       struct timespec ts;
+       ktime_t t, *tp = NULL;
+       int val2 = 0;
+       int cmd = op & FUTEX_CMD_MASK;
+
+       if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+                     cmd == FUTEX_WAIT_BITSET ||
+                     cmd == FUTEX_WAIT_REQUEUE_PI)) {
+               if (compat_get_timespec(&ts, utime))
+                       return -EFAULT;
+               if (!timespec_valid(&ts))
+                       return -EINVAL;
+
+               t = timespec_to_ktime(ts);
+               if (cmd == FUTEX_WAIT)
+                       t = ktime_add_safe(ktime_get(), t);
+               tp = &t;
+       }
+       if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+           cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
+               val2 = (int) (unsigned long) utime;
+
+       return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+}
+#endif /* CONFIG_COMPAT */
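
For context, a hedged user-space sketch (not part of this patch): the conversion above reflects the futex timeout convention in which FUTEX_WAIT takes a relative timeout (hence ktime_add_safe(ktime_get(), t) only for FUTEX_WAIT) while FUTEX_WAIT_BITSET and the PI/requeue variants take absolute deadlines. wait_relative() and wait_absolute() are illustrative names and error handling is omitted.

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static int wait_relative(uint32_t *uaddr, uint32_t expected)
{
        struct timespec rel = { .tv_sec = 1, .tv_nsec = 0 };    /* 1s from now */

        return (int)syscall(SYS_futex, uaddr, FUTEX_WAIT, expected,
                            &rel, NULL, 0);
}

static int wait_absolute(uint32_t *uaddr, uint32_t expected)
{
        struct timespec abs;

        clock_gettime(CLOCK_MONOTONIC, &abs);
        abs.tv_sec += 1;                        /* deadline: now + 1s */

        return (int)syscall(SYS_futex, uaddr, FUTEX_WAIT_BITSET, expected,
                            &abs, NULL, FUTEX_BITSET_MATCH_ANY);
}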
+
 static void __init futex_detect_cmpxchg(void)
 {
 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG