diff --git a/kernel/futex.c b/kernel/futex.c
index 45858ec739411f5741667e560552757697441e6b..357348a6cf6b4d71dbc24c9ad89ad2d0e63b20d2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -802,7 +802,7 @@ static int refill_pi_state_cache(void)
        return 0;
 }
 
-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
 {
        struct futex_pi_state *pi_state = current->pi_state_cache;
 
@@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void)
        return pi_state;
 }
 
+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+       WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
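The new get_pi_state() is deliberately just a WARN_ON_ONCE() around atomic_inc_not_zero(): every caller is expected to already hold a reference, or a lock that pins one, so observing a zero refcount here is a bug rather than a condition to handle. A minimal userspace sketch of the inc-not-zero semantics, using C11 atomics; struct obj and get_ref_not_zero() are illustrative names, not kernel code:

	#include <stdatomic.h>
	#include <stdbool.h>

	struct obj { atomic_int refcount; };

	static bool get_ref_not_zero(struct obj *o)
	{
		int c = atomic_load(&o->refcount);

		while (c != 0) {
			/* bump the count only if it is still non-zero */
			if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
				return true;
		}
		return false;	/* object already on its way to being freed */
	}

The same take-a-reference-first pattern shows up in exit_pi_state_list() below: a reference is grabbed while hb->lock still pins the pi_state, and only then is the lock dropped to call rt_mutex_futex_unlock().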
 /*
  * Drops a reference to the pi_state object and frees or caches it
  * when the last reference is gone.
@@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
  * Look up the task based on what TID userspace gave us.
  * We don't trust it.
  */
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
 {
        struct task_struct *p;
 
@@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
                pi_state->owner = NULL;
                raw_spin_unlock_irq(&curr->pi_lock);
 
-               rt_mutex_unlock(&pi_state->pi_mutex);
-
+               get_pi_state(pi_state);
                spin_unlock(&hb->lock);
 
+               rt_mutex_futex_unlock(&pi_state->pi_mutex);
+               put_pi_state(pi_state);
+
                raw_spin_lock_irq(&curr->pi_lock);
        }
        raw_spin_unlock_irq(&curr->pi_lock);
@@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
  *
  * [10] There is no transient state which leaves owner and user space
  *     TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ *     hb -> futex_q, relation
+ *     futex_q -> pi_state, relation
+ *
+ *     (cannot be raw because hb can contain an arbitrary
+ *      number of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ *     {uval, pi_state}
+ *
+ *     (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ *     p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ *     pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ *   hb->lock
+ *     pi_mutex->wait_lock
+ *       p->pi_lock
+ *
  */
 
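This ordering is load-bearing for everything below: hb->lock stays a sleeping spinlock on PREEMPT_RT precisely because a bucket can hold an unbounded number of futex_q's, while the two inner locks are raw. A kernel-context sketch of the documented nesting (not standalone-compilable; hb, pi_state and p stand for the structures named above):

	spin_lock(&hb->lock);			/* hb -> futex_q, futex_q -> pi_state */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);	/* {uval, pi_state} */
	raw_spin_lock(&p->pi_lock);		/* p->pi_state_list <-> pi_state->list */

	/* ... */

	raw_spin_unlock(&p->pi_lock);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(&hb->lock);

Note that the innermost p->pi_lock is taken without the _irq suffix: interrupts are already disabled by the wait_lock acquisition, which is exactly what the reworked fixup_pi_state_owner() below does.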
 /*
@@ -980,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr)
  * the pi_state against the user space value. If correct, attach to
  * it.
  */
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+                             struct futex_pi_state *pi_state,
                              struct futex_pi_state **ps)
 {
        pid_t pid = uval & FUTEX_TID_MASK;
+       u32 uval2;
+       int ret;
 
        /*
         * Userspace might have messed up non-PI and PI futexes [3]
@@ -991,8 +1034,38 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
        if (unlikely(!pi_state))
                return -EINVAL;
 
+       /*
+        * We get here with hb->lock held, and having found a
+        * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+        * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+        * which in turn means that futex_lock_pi() still has a reference on
+        * our pi_state.
+        *
+        * The waiter holding a reference on @pi_state also protects against
+        * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+        * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+        * free pi_state before we can take a reference ourselves.
+        */
        WARN_ON(!atomic_read(&pi_state->refcount));
 
+       /*
+        * Now that we have a pi_state, we can acquire wait_lock
+        * and do the state validation.
+        */
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+       /*
+        * Since {uval, pi_state} is serialized by wait_lock, and our current
+        * uval was read without holding it, it can have changed. Verify it
+        * still is what we expect it to be, otherwise retry the entire
+        * operation.
+        */
+       if (get_futex_value_locked(&uval2, uaddr))
+               goto out_efault;
+
+       if (uval != uval2)
+               goto out_eagain;
+
        /*
         * Handle the owner died case:
         */
@@ -1008,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
                         * is not 0. Inconsistent state. [5]
                         */
                        if (pid)
-                               return -EINVAL;
+                               goto out_einval;
                        /*
                         * Take a ref on the state and return success. [4]
                         */
-                       goto out_state;
+                       goto out_attach;
                }
 
                /*
@@ -1024,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
                 * Take a ref on the state and return success. [6]
                 */
                if (!pid)
-                       goto out_state;
+                       goto out_attach;
        } else {
                /*
                 * If the owner died bit is not set, then the pi_state
                 * must have an owner. [7]
                 */
                if (!pi_state->owner)
-                       return -EINVAL;
+                       goto out_einval;
        }
 
        /*
@@ -1040,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
         * user space TID. [9/10]
         */
        if (pid != task_pid_vnr(pi_state->owner))
-               return -EINVAL;
-out_state:
-       atomic_inc(&pi_state->refcount);
+               goto out_einval;
+
+out_attach:
+       get_pi_state(pi_state);
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        *ps = pi_state;
        return 0;
+
+out_einval:
+       ret = -EINVAL;
+       goto out_error;
+
+out_eagain:
+       ret = -EAGAIN;
+       goto out_error;
+
+out_efault:
+       ret = -EFAULT;
+       goto out_error;
+
+out_error:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       return ret;
 }
 
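The net effect of the changes to attach_to_pi_state() is that the {uval, pi_state} pair is now validated under pi_mutex->wait_lock, and a uval that went stale between the unlocked read and the lock acquisition becomes -EAGAIN so the caller redoes the whole lookup. Stripped of the owner-died cases, the added flow is equivalent to this kernel-context sketch, where valid stands for the [4]..[10] checks:

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	if (get_futex_value_locked(&uval2, uaddr))
		ret = -EFAULT;		/* page not resident: caller faults it in */
	else if (uval != uval2)
		ret = -EAGAIN;		/* raced with userspace: retry everything */
	else if (!valid)
		ret = -EINVAL;		/* inconsistent owner/TID state */
	else {
		get_pi_state(pi_state);	/* the waiter's own ref keeps this > 0 */
		*ps = pi_state;
		ret = 0;
	}

	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;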
 /*
@@ -1095,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 
        /*
         * No existing pi state. First waiter. [2]
+        *
+        * This creates pi_state, and since we hold hb->lock nothing can
+        * observe this state yet, so wait_lock is irrelevant.
         */
        pi_state = alloc_pi_state();
 
@@ -1119,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
        return 0;
 }
 
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+                          struct futex_hash_bucket *hb,
                           union futex_key *key, struct futex_pi_state **ps)
 {
-       struct futex_q *match = futex_top_waiter(hb, key);
+       struct futex_q *top_waiter = futex_top_waiter(hb, key);
 
        /*
         * If there is a waiter on that futex, validate it and
         * attach to the pi_state when the validation succeeds.
         */
-       if (match)
-               return attach_to_pi_state(uval, match->pi_state, ps);
+       if (top_waiter)
+               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
 
        /*
         * We are the first waiter - try to look up the owner based on
@@ -1148,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
        if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
                return -EFAULT;
 
-       /*If user space value changed, let the caller retry */
+       /* If user space value changed, let the caller retry */
        return curval != uval ? -EAGAIN : 0;
 }
 
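lock_pi_update_atomic() is a thin wrapper over a cmpxchg on the futex word: the only failure modes are the access faulting (-EFAULT) and the value having changed (-EAGAIN). A userspace C11 analogue of the compare-and-retry contract; the function name is illustrative and the fault case has no userspace equivalent:

	#include <errno.h>
	#include <stdatomic.h>
	#include <stdint.h>

	static int update_futex_word(_Atomic uint32_t *uaddr, uint32_t uval,
				     uint32_t newval)
	{
		/* If the user space value changed, let the caller retry. */
		return atomic_compare_exchange_strong(uaddr, &uval, newval) ? 0 : -EAGAIN;
	}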
@@ -1176,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                                struct task_struct *task, int set_waiters)
 {
        u32 uval, newval, vpid = task_pid_vnr(task);
-       struct futex_q *match;
+       struct futex_q *top_waiter;
        int ret;
 
        /*
@@ -1202,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
         * Lookup existing state first. If it exists, try to attach to
         * its pi_state.
         */
-       match = futex_top_waiter(hb, key);
-       if (match)
-               return attach_to_pi_state(uval, match->pi_state, ps);
+       top_waiter = futex_top_waiter(hb, key);
+       if (top_waiter)
+               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
 
        /*
         * No waiter and user TID is 0. We are here because the
@@ -1285,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
        wake_q_add(wake_q, p);
        __unqueue_futex(q);
        /*
-        * The waiting task can free the futex_q as soon as
-        * q->lock_ptr = NULL is written, without taking any locks. A
-        * memory barrier is required here to prevent the following
-        * store to lock_ptr from getting ahead of the plist_del.
+        * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
+        * is written, without taking any locks. This is possible in the event
+        * of a spurious wakeup, for example. A memory barrier is required here
+        * to prevent the following store to lock_ptr from getting ahead of the
+        * plist_del in __unqueue_futex().
         */
-       smp_wmb();
-       q->lock_ptr = NULL;
+       smp_store_release(&q->lock_ptr, NULL);
 }
 
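smp_store_release() provides the same ordering the old smp_wmb() + plain store did (the plist_del in __unqueue_futex() cannot be reordered past the NULL store) while also naming the other half of the contract: the waiter may free the futex_q the moment it observes lock_ptr == NULL. A userspace C11 sketch of that release/acquire pairing; struct waiter and both function names are illustrative:

	#include <stdatomic.h>
	#include <stddef.h>

	struct waiter {
		void *_Atomic lock_ptr;
		/* list linkage elided */
	};

	static void wake_side(struct waiter *q)
	{
		/* ...unlink q from the bucket list first... */
		atomic_store_explicit(&q->lock_ptr, NULL, memory_order_release);
	}

	static int wait_side(struct waiter *q)
	{
		/* pairs with the release store; q is ours again on NULL */
		return atomic_load_explicit(&q->lock_ptr, memory_order_acquire) == NULL;
	}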
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
-                        struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
 {
-       struct task_struct *new_owner;
-       struct futex_pi_state *pi_state = this->pi_state;
        u32 uninitialized_var(curval), newval;
+       struct task_struct *new_owner;
+       bool postunlock = false;
        DEFINE_WAKE_Q(wake_q);
-       bool deboost;
        int ret = 0;
 
-       if (!pi_state)
-               return -EINVAL;
-
-       /*
-        * If current does not own the pi_state then the futex is
-        * inconsistent and user space fiddled with the futex value.
-        */
-       if (pi_state->owner != current)
-               return -EINVAL;
-
-       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+       if (WARN_ON_ONCE(!new_owner)) {
+               /*
+                * As per the comment in futex_unlock_pi(), this should not happen.
+                *
+                * When this happens, give up our locks and try again, giving
+                * the futex_lock_pi() instance time to complete, either by
+                * waiting on the rtmutex or removing itself from the futex
+                * queue.
+                */
+               ret = -EAGAIN;
+               goto out_unlock;
+       }
 
        /*
-        * It is possible that the next waiter (the one that brought
-        * this owner to the kernel) timed out and is no longer
-        * waiting on the lock.
-        */
-       if (!new_owner)
-               new_owner = this->task;
-
-       /*
-        * We pass it to the next owner. The WAITERS bit is always
-        * kept enabled while there is PI state around. We cleanup the
-        * owner died bit, because we are the owner.
+        * We pass it to the next owner. The WAITERS bit is always kept
+        * enabled while there is PI state around. We cleanup the owner
+        * died bit, because we are the owner.
         */
        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
@@ -1337,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
 
        if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
                ret = -EFAULT;
+
        } else if (curval != uval) {
                /*
                 * If an unconditional UNLOCK_PI operation (user space did not
@@ -1349,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
                else
                        ret = -EINVAL;
        }
-       if (ret) {
-               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-               return ret;
-       }
+
+       if (ret)
+               goto out_unlock;
+
+       /*
+        * This is a point of no return; once we modify the uval there is no
+        * going back and subsequent operations must not fail.
+        */
 
        raw_spin_lock(&pi_state->owner->pi_lock);
        WARN_ON(list_empty(&pi_state->list));
@@ -1365,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
        pi_state->owner = new_owner;
        raw_spin_unlock(&new_owner->pi_lock);
 
-       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
 
-       deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+out_unlock:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 
-       /*
-        * First unlock HB so the waiter does not spin on it once he got woken
-        * up. Second wake up the waiter before the priority is adjusted. If we
-        * deboost first (and lose our higher priority), then the task might get
-        * scheduled away before the wake up can take place.
-        */
-       spin_unlock(&hb->lock);
-       wake_up_q(&wake_q);
-       if (deboost)
-               rt_mutex_adjust_prio(current);
+       if (postunlock)
+               rt_mutex_postunlock(&wake_q);
 
-       return 0;
+       return ret;
 }
 
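Past the "point of no return" comment, wake_futex_pi() must not fail, so the wakeup is staged on a wake_q while wait_lock is held and only issued afterwards; rt_mutex_postunlock() then combines wake_up_q() with the priority adjustment, replacing the old unlock-hb/wake/deboost dance at the call site. The resulting shape, as a condensed kernel-context sketch (in the diff, wait_lock is actually taken by the caller and dropped at out_unlock):

	DEFINE_WAKE_Q(wake_q);
	bool postunlock;

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	/* ...update the user space value and pi_state->owner... */
	postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wake_q);	/* wake_up_q() + deboost, lock-free */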
 /*
@@ -1826,7 +1913,7 @@ retry_private:
                         * If that call succeeds then we have pi_state and an
                         * initial refcount on it.
                         */
-                       ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
                }
 
                switch (ret) {
@@ -1909,7 +1996,7 @@ retry_private:
                         * refcount on the pi_state and store the pointer in
                         * the futex_q object of the waiter.
                         */
-                       atomic_inc(&pi_state->refcount);
+                       get_pi_state(pi_state);
                        this->pi_state = pi_state;
                        ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
                                                        this->rt_waiter,
@@ -2009,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb)
        hb_waiters_dec(hb);
 }
 
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb:        The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me().  The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
-       __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
        int prio;
 
@@ -2039,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
        plist_node_init(&q->list, prio);
        plist_add(&q->list, &hb->chain);
        q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb:        The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me().  The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+       __releases(&hb->lock)
+{
+       __queue_me(q, hb);
        spin_unlock(&hb->lock);
 }
 
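The queue_me() split exists so futex_lock_pi() can enqueue without giving up hb->lock: it must keep the bucket locked across the wait_lock acquisition that follows (see the PREEMPT_RT_FULL comment further down). A kernel-context sketch of the intended pairing:

	__queue_me(&q, hb);		/* enqueue; hb->lock stays held */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);	/* q.lock_ptr is &hb->lock after queueing */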
@@ -2125,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 {
        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        struct futex_pi_state *pi_state = q->pi_state;
-       struct task_struct *oldowner = pi_state->owner;
        u32 uval, uninitialized_var(curval), newval;
+       struct task_struct *oldowner;
        int ret;
 
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+       oldowner = pi_state->owner;
        /* Owner died? */
        if (!pi_state->owner)
                newtid |= FUTEX_OWNER_DIED;
@@ -2136,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
        /*
         * We are here either because we stole the rtmutex from the
         * previous highest priority waiter or we are the highest priority
-        * waiter but failed to get the rtmutex the first time.
+        * waiter but have failed to get the rtmutex the first time.
+        *
         * We have to replace the newowner TID in the user space variable.
         * This must be atomic as we have to preserve the owner died bit here.
         *
@@ -2144,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
         * because we can fault here. Imagine swapped out pages or a fork
         * that marked all the anonymous memory readonly for cow.
         *
-        * Modifying pi_state _before_ the user space value would
-        * leave the pi_state in an inconsistent state when we fault
-        * here, because we need to drop the hash bucket lock to
-        * handle the fault. This might be observed in the PID check
-        * in lookup_pi_state.
+        * Modifying pi_state _before_ the user space value would leave the
+        * pi_state in an inconsistent state when we fault here, because we
+        * need to drop the locks to handle the fault. This might be observed
+        * in the PID check in lookup_pi_state.
         */
 retry:
        if (get_futex_value_locked(&uval, uaddr))
                goto handle_fault;
 
-       while (1) {
+       for (;;) {
                newval = (uval & FUTEX_OWNER_DIED) | newtid;
 
                if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2169,47 +2264,60 @@ retry:
         * itself.
         */
        if (pi_state->owner != NULL) {
-               raw_spin_lock_irq(&pi_state->owner->pi_lock);
+               raw_spin_lock(&pi_state->owner->pi_lock);
                WARN_ON(list_empty(&pi_state->list));
                list_del_init(&pi_state->list);
-               raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+               raw_spin_unlock(&pi_state->owner->pi_lock);
        }
 
        pi_state->owner = newowner;
 
-       raw_spin_lock_irq(&newowner->pi_lock);
+       raw_spin_lock(&newowner->pi_lock);
        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &newowner->pi_state_list);
-       raw_spin_unlock_irq(&newowner->pi_lock);
+       raw_spin_unlock(&newowner->pi_lock);
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
        return 0;
 
        /*
-        * To handle the page fault we need to drop the hash bucket
-        * lock here. That gives the other task (either the highest priority
-        * waiter itself or the task which stole the rtmutex) the
-        * chance to try the fixup of the pi_state. So once we are
-        * back from handling the fault we need to check the pi_state
-        * after reacquiring the hash bucket lock and before trying to
-        * do another fixup. When the fixup has been done already we
-        * simply return.
+        * To handle the page fault we need to drop the locks here. That gives
+        * the other task (either the highest priority waiter itself or the
+        * task which stole the rtmutex) the chance to try the fixup of the
+        * pi_state. So once we are back from handling the fault we need to
+        * check the pi_state after reacquiring the locks and before trying to
+        * do another fixup. When the fixup has been done already we simply
+        * return.
+        *
+        * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+        * drop hb->lock since the caller owns the hb -> futex_q relation.
+        * Dropping pi_mutex->wait_lock requires revalidating the state.
         */
 handle_fault:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
        spin_unlock(q->lock_ptr);
 
        ret = fault_in_user_writeable(uaddr);
 
        spin_lock(q->lock_ptr);
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
 
        /*
         * Check if someone else fixed it for us:
         */
-       if (pi_state->owner != oldowner)
-               return 0;
+       if (pi_state->owner != oldowner) {
+               ret = 0;
+               goto out_unlock;
+       }
 
        if (ret)
-               return ret;
+               goto out_unlock;
 
        goto retry;
+
+out_unlock:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       return ret;
 }
 
 static long futex_wait_restart(struct restart_block *restart);
@@ -2231,57 +2339,32 @@ static long futex_wait_restart(struct restart_block *restart);
  */
 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 {
-       struct task_struct *owner;
        int ret = 0;
 
        if (locked) {
                /*
                 * Got the lock. We might not be the anticipated owner if we
                 * did a lock-steal - fix up the PI-state in that case:
+                *
+                * We can safely read pi_state->owner without holding wait_lock
+                * because we now own the rt_mutex, only the owner will attempt
+                * to change it.
                 */
                if (q->pi_state->owner != current)
                        ret = fixup_pi_state_owner(uaddr, q, current);
                goto out;
        }
 
-       /*
-        * Catch the rare case, where the lock was released when we were on the
-        * way back before we locked the hash bucket.
-        */
-       if (q->pi_state->owner == current) {
-               /*
-                * Try to get the rt_mutex now. This might fail as some other
-                * task acquired the rt_mutex after we removed ourself from the
-                * rt_mutex waiters list.
-                */
-               if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
-                       locked = 1;
-                       goto out;
-               }
-
-               /*
-                * pi_state is incorrect, some other task did a lock steal and
-                * we returned due to timeout or signal without taking the
-                * rt_mutex. Too late.
-                */
-               raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
-               owner = rt_mutex_owner(&q->pi_state->pi_mutex);
-               if (!owner)
-                       owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
-               raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
-               ret = fixup_pi_state_owner(uaddr, q, owner);
-               goto out;
-       }
-
        /*
         * Paranoia check. If we did not take the lock, then we should not be
         * the owner of the rt_mutex.
         */
-       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
                printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
                                "pi-state %p\n", ret,
                                q->pi_state->pi_mutex.owner,
                                q->pi_state->owner);
+       }
 
 out:
        return ret ? ret : locked;
@@ -2505,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
                         ktime_t *time, int trylock)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
+       struct futex_pi_state *pi_state = NULL;
+       struct rt_mutex_waiter rt_waiter;
        struct futex_hash_bucket *hb;
        struct futex_q q = futex_q_init;
        int res, ret;
@@ -2557,24 +2642,67 @@ retry_private:
                }
        }
 
+       WARN_ON(!q.pi_state);
+
        /*
         * Only actually queue now that the atomic ops are done:
         */
-       queue_me(&q, hb);
+       __queue_me(&q, hb);
 
-       WARN_ON(!q.pi_state);
-       /*
-        * Block on the PI mutex:
-        */
-       if (!trylock) {
-               ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
-       } else {
-               ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+       if (trylock) {
+               ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
                /* Fixup the trylock return value: */
                ret = ret ? 0 : -EWOULDBLOCK;
+               goto no_block;
        }
 
+       rt_mutex_init_waiter(&rt_waiter);
+
+       /*
+        * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+        * hold it while doing __rt_mutex_start_proxy_lock(), because then it
+        * will include hb->lock in the blocking chain, even though we'll not in
+        * fact hold it while blocking. This will lead it to report -EDEADLK
+        * and BUG when futex_unlock_pi() interleaves with this.
+        *
+        * Therefore acquire wait_lock while holding hb->lock, but drop the
+        * latter before calling rt_mutex_start_proxy_lock(). This still fully
+        * serializes against futex_unlock_pi() as that does the exact same
+        * lock handoff sequence.
+        */
+       raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+       spin_unlock(q.lock_ptr);
+       ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+       raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
+       if (ret) {
+               if (ret == 1)
+                       ret = 0;
+
+               spin_lock(q.lock_ptr);
+               goto no_block;
+       }
+
+       if (unlikely(to))
+               hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+       ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
        spin_lock(q.lock_ptr);
+       /*
+        * If we failed to acquire the lock (signal/timeout), we must
+        * first acquire the hb->lock before removing the lock from the
+        * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+        * wait lists consistent.
+        *
+        * In particular; it is important that futex_unlock_pi() can not
+        * observe this inconsistency.
+        */
+       if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+               ret = 0;
+
+no_block:
        /*
         * Fixup the pi_state owner and possibly acquire the lock if we
         * haven't already.
@@ -2591,12 +2719,19 @@ retry_private:
         * If fixup_owner() faulted and was unable to handle the fault, unlock
         * it and return the fault to userspace.
         */
-       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
-               rt_mutex_unlock(&q.pi_state->pi_mutex);
+       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+               pi_state = q.pi_state;
+               get_pi_state(pi_state);
+       }
 
        /* Unqueue and drop the lock */
        unqueue_me_pi(&q);
 
+       if (pi_state) {
+               rt_mutex_futex_unlock(&pi_state->pi_mutex);
+               put_pi_state(pi_state);
+       }
+
        goto out_put_key;
 
 out_unlock_put_key:
@@ -2605,8 +2740,10 @@ out_unlock_put_key:
 out_put_key:
        put_futex_key(&q.key);
 out:
-       if (to)
+       if (to) {
+               hrtimer_cancel(&to->timer);
                destroy_hrtimer_on_stack(&to->timer);
+       }
        return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
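For reference, futex_lock_pi() above and futex_unlock_pi() below perform the same handoff: each takes wait_lock before letting go of hb->lock, so at any instant a path holds at least one of the two locks and neither side can observe a half-updated state. Summarized side by side (a sketch of the sequences in this diff, not literal code):

	futex_lock_pi()                          futex_unlock_pi()
	  spin_lock(&hb->lock);                    spin_lock(&hb->lock);
	  raw_spin_lock_irq(&...wait_lock);        raw_spin_lock_irq(&...wait_lock);
	  spin_unlock(&hb->lock);                  spin_unlock(&hb->lock);
	  __rt_mutex_start_proxy_lock(...);        wake_futex_pi(...);
	  raw_spin_unlock_irq(&...wait_lock);        /* drops wait_lock internally */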
@@ -2633,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
        u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
        union futex_key key = FUTEX_KEY_INIT;
        struct futex_hash_bucket *hb;
-       struct futex_q *match;
+       struct futex_q *top_waiter;
        int ret;
 
 retry:
@@ -2657,12 +2794,37 @@ retry:
         * all and we at least want to know if user space fiddled
         * with the futex value instead of blindly unlocking.
         */
-       match = futex_top_waiter(hb, &key);
-       if (match) {
-               ret = wake_futex_pi(uaddr, uval, match, hb);
+       top_waiter = futex_top_waiter(hb, &key);
+       if (top_waiter) {
+               struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+               ret = -EINVAL;
+               if (!pi_state)
+                       goto out_unlock;
+
                /*
-                * In case of success wake_futex_pi dropped the hash
-                * bucket lock.
+                * If current does not own the pi_state then the futex is
+                * inconsistent and user space fiddled with the futex value.
+                */
+               if (pi_state->owner != current)
+                       goto out_unlock;
+
+               get_pi_state(pi_state);
+               /*
+                * By taking wait_lock while still holding hb->lock, we ensure
+                * there is no point where we hold neither; and therefore
+                * wake_futex_pi() must observe a state consistent with what we
+                * observed.
+                */
+               raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+               spin_unlock(&hb->lock);
+
+               ret = wake_futex_pi(uaddr, uval, pi_state);
+
+               put_pi_state(pi_state);
+
+               /*
+                * Success, we're done! No tricky corner cases.
                 */
                if (!ret)
                        goto out_putkey;
@@ -2677,7 +2839,6 @@ retry:
                 * setting the FUTEX_WAITERS bit. Try again.
                 */
                if (ret == -EAGAIN) {
-                       spin_unlock(&hb->lock);
                        put_futex_key(&key);
                        goto retry;
                }
@@ -2685,7 +2846,7 @@ retry:
                 * wake_futex_pi has detected invalid state. Tell user
                 * space.
                 */
-               goto out_unlock;
+               goto out_putkey;
        }
 
        /*
@@ -2695,8 +2856,10 @@ retry:
         * preserve the WAITERS bit not the OWNER_DIED one. We are the
         * owner.
         */
-       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+               spin_unlock(&hb->lock);
                goto pi_faulted;
+       }
 
        /*
         * If uval has changed, let user space handle it.
@@ -2710,7 +2873,6 @@ out_putkey:
        return ret;
 
 pi_faulted:
-       spin_unlock(&hb->lock);
        put_futex_key(&key);
 
        ret = fault_in_user_writeable(uaddr);
@@ -2814,6 +2976,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                                 u32 __user *uaddr2)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
+       struct futex_pi_state *pi_state = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct futex_hash_bucket *hb;
        union futex_key key2 = FUTEX_KEY_INIT;
@@ -2840,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
         * The waiter is allocated on our stack, manipulated by the requeue
         * code while we sleep on uaddr.
         */
-       debug_rt_mutex_init_waiter(&rt_waiter);
-       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
-       RB_CLEAR_NODE(&rt_waiter.tree_entry);
-       rt_waiter.task = NULL;
+       rt_mutex_init_waiter(&rt_waiter);
 
        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
        if (unlikely(ret != 0))
@@ -2898,8 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                if (q.pi_state && (q.pi_state->owner != current)) {
                        spin_lock(q.lock_ptr);
                        ret = fixup_pi_state_owner(uaddr2, &q, current);
-                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
-                               rt_mutex_unlock(&q.pi_state->pi_mutex);
+                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+                               pi_state = q.pi_state;
+                               get_pi_state(pi_state);
+                       }
                        /*
                         * Drop the reference to the pi state which
                         * the requeue_pi() code acquired for us.
@@ -2917,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                 */
                WARN_ON(!q.pi_state);
                pi_mutex = &q.pi_state->pi_mutex;
-               ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
-               debug_rt_mutex_free_waiter(&rt_waiter);
+               ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
 
                spin_lock(q.lock_ptr);
+               if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+                       ret = 0;
+
+               debug_rt_mutex_free_waiter(&rt_waiter);
                /*
                 * Fixup the pi_state owner and possibly acquire the lock if we
                 * haven't already.
@@ -2938,13 +3103,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                 * the fault, unlock the rt_mutex and return the fault to
                 * userspace.
                 */
-               if (ret && rt_mutex_owner(pi_mutex) == current)
-                       rt_mutex_unlock(pi_mutex);
+               if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+                       pi_state = q.pi_state;
+                       get_pi_state(pi_state);
+               }
 
                /* Unqueue and drop the lock. */
                unqueue_me_pi(&q);
        }
 
+       if (pi_state) {
+               rt_mutex_futex_unlock(&pi_state->pi_mutex);
+               put_pi_state(pi_state);
+       }
+
        if (ret == -EINTR) {
                /*
                 * We've already been requeued, but cannot restart by calling