block, scsi: Make SCSI quiesce and resume work reliably

[mirror_ubuntu-bionic-kernel.git] / block / blk-mq.c
diff --git a/block/blk-mq.c b/block/blk-mq.c

index 9eea67ce82d90e2f77649a1bc46975e96f038358..211bc8a3e2cc48f37993fdedeee5ae9dd694f522 100644 (file)
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -126,7 +126,8 @@ void blk_freeze_queue_start(struct request_queue *q)
         freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
         if (freeze_depth == 1) {
                 percpu_ref_kill(&q->q_usage_counter);
-               blk_mq_run_hw_queues(q, false);
+               if (q->mq_ops)
+                       blk_mq_run_hw_queues(q, false);
         }
  }
  EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -256,13 +257,6 @@ void blk_mq_wake_waiters(struct request_queue *q)
         queue_for_each_hw_ctx(q, hctx, i)
                 if (blk_mq_hw_queue_mapped(hctx))
                         blk_mq_tag_wakeup_all(hctx->tags, true);
-
-       /*
-        * If we are called because the queue has now been marked as
-        * dying, we need to ensure that processes currently waiting on
-        * the queue are notified as well.
-        */
-       wake_up_all(&q->mq_freeze_wq);
  }
  
  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
@@ -297,6 +291,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
         rq->q = data->q;
         rq->mq_ctx = data->ctx;
         rq->cmd_flags = op;
+       if (data->flags & BLK_MQ_REQ_PREEMPT)
+               rq->rq_flags |= RQF_PREEMPT;
         if (blk_queue_io_stat(data->q))
                 rq->rq_flags |= RQF_IO_STAT;
         /* do not touch atomic flags, it needs atomic ops against the timer */
@@ -337,12 +333,14 @@ static struct request *blk_mq_get_request(struct request_queue *q,
         struct elevator_queue *e = q->elevator;
         struct request *rq;
         unsigned int tag;
-       struct blk_mq_ctx *local_ctx = NULL;
+       bool put_ctx_on_error = false;
  
         blk_queue_enter_live(q);
         data->q = q;
-       if (likely(!data->ctx))
-               data->ctx = local_ctx = blk_mq_get_ctx(q);
+       if (likely(!data->ctx)) {
+               data->ctx = blk_mq_get_ctx(q);
+               put_ctx_on_error = true;
+       }
         if (likely(!data->hctx))
                 data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
         if (op & REQ_NOWAIT)
@@ -361,8 +359,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
  
         tag = blk_mq_get_tag(data);
         if (tag == BLK_MQ_TAG_FAIL) {
-               if (local_ctx) {
-                       blk_mq_put_ctx(local_ctx);
+               if (put_ctx_on_error) {
+                       blk_mq_put_ctx(data->ctx);
                         data->ctx = NULL;
                 }
                 blk_queue_exit(q);
@@ -391,7 +389,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
         struct request *rq;
         int ret;
  
-       ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
+       ret = blk_queue_enter(q, flags);
         if (ret)
                 return ERR_PTR(ret);
  
@@ -430,7 +428,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
         if (hctx_idx >= q->nr_hw_queues)
                 return ERR_PTR(-EIO);
  
-       ret = blk_queue_enter(q, true);
+       ret = blk_queue_enter(q, flags);
         if (ret)
                 return ERR_PTR(ret);
  
@@ -651,6 +649,8 @@ static void __blk_mq_requeue_request(struct request *rq)
  {
         struct request_queue *q = rq->q;
  
+       blk_mq_put_driver_tag(rq);
+
         trace_block_rq_requeue(q, rq);
         wbt_requeue(q->rq_wb, &rq->issue_stat);
         blk_mq_sched_requeue_request(rq);
@@ -994,105 +994,64 @@ done:
         return rq->tag != -1;
  }
  
-static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
-                                   struct request *rq)
-{
-       blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
-       rq->tag = -1;
-
-       if (rq->rq_flags & RQF_MQ_INFLIGHT) {
-               rq->rq_flags &= ~RQF_MQ_INFLIGHT;
-               atomic_dec(&hctx->nr_active);
-       }
-}
-
-static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
-                                      struct request *rq)
-{
-       if (rq->tag == -1 || rq->internal_tag == -1)
-               return;
-
-       __blk_mq_put_driver_tag(hctx, rq);
-}
-
-static void blk_mq_put_driver_tag(struct request *rq)
-{
-       struct blk_mq_hw_ctx *hctx;
-
-       if (rq->tag == -1 || rq->internal_tag == -1)
-               return;
-
-       hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
-       __blk_mq_put_driver_tag(hctx, rq);
-}
-
-/*
- * If we fail getting a driver tag because all the driver tags are already
- * assigned and on the dispatch list, BUT the first entry does not have a
- * tag, then we could deadlock. For that case, move entries with assigned
- * driver tags to the front, leaving the set of tagged requests in the
- * same order, and the untagged set in the same order.
- */
-static bool reorder_tags_to_front(struct list_head *list)
-{
-       struct request *rq, *tmp, *first = NULL;
-
-       list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
-               if (rq == first)
-                       break;
-               if (rq->tag != -1) {
-                       list_move(&rq->queuelist, list);
-                       if (!first)
-                               first = rq;
-               }
-       }
-
-       return first != NULL;
-}
-
-static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
-                               void *key)
+static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
+                               int flags, void *key)
  {
         struct blk_mq_hw_ctx *hctx;
  
         hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
  
-       list_del(&wait->entry);
-       clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
+       list_del_init(&wait->entry);
         blk_mq_run_hw_queue(hctx, true);
         return 1;
  }
  
-static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx **hctx,
+                                    struct request *rq)
  {
+       struct blk_mq_hw_ctx *this_hctx = *hctx;
+       wait_queue_entry_t *wait = &this_hctx->dispatch_wait;
         struct sbq_wait_state *ws;
  
+       if (!list_empty_careful(&wait->entry))
+               return false;
+
+       spin_lock(&this_hctx->lock);
+       if (!list_empty(&wait->entry)) {
+               spin_unlock(&this_hctx->lock);
+               return false;
+       }
+
+       ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
+       add_wait_queue(&ws->wait, wait);
+
         /*
-        * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
-        * The thread which wins the race to grab this bit adds the hardware
-        * queue to the wait queue.
+        * It's possible that a tag was freed in the window between the
+        * allocation failure and adding the hardware queue to the wait
+        * queue.
          */
-       if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
-           test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+       if (!blk_mq_get_driver_tag(rq, hctx, false)) {
+               spin_unlock(&this_hctx->lock);
                 return false;
-
-       init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
-       ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+       }
  
         /*
-        * As soon as this returns, it's no longer safe to fiddle with
-        * hctx->dispatch_wait, since a completion can wake up the wait queue
-        * and unlock the bit.
+        * We got a tag, remove ourselves from the wait queue to ensure
+        * someone else gets the wakeup.
          */
-       add_wait_queue(&ws->wait, &hctx->dispatch_wait);
+       spin_lock_irq(&ws->wait.lock);
+       list_del_init(&wait->entry);
+       spin_unlock_irq(&ws->wait.lock);
+       spin_unlock(&this_hctx->lock);
         return true;
  }
  
  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
-               bool got_budget)
+                            bool got_budget)
  {
         struct blk_mq_hw_ctx *hctx;
-       struct request *rq;
+       struct request *rq, *nxt;
+       bool no_tag = false;
         int errors, queued;
  
         if (list_empty(list))
@@ -1110,37 +1069,24 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
  
                 rq = list_first_entry(list, struct request, queuelist);
                 if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
-                       if (!queued && reorder_tags_to_front(list))
-                               continue;
-
                         /*
                          * The initial allocation attempt failed, so we need to
-                        * rerun the hardware queue when a tag is freed.
+                        * rerun the hardware queue when a tag is freed. The
+                        * waitqueue takes care of that. If the queue is run
+                        * before we add this entry back on the dispatch list,
+                        * we'll re-run it below.
                          */
-                       if (!blk_mq_dispatch_wait_add(hctx)) {
-                               if (got_budget)
-                                       blk_mq_put_dispatch_budget(hctx);
-                               break;
-                       }
-
-                       /*
-                        * It's possible that a tag was freed in the window
-                        * between the allocation failure and adding the
-                        * hardware queue to the wait queue.
-                        */
-                       if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+                       if (!blk_mq_dispatch_wait_add(&hctx, rq)) {
                                 if (got_budget)
                                         blk_mq_put_dispatch_budget(hctx);
+                               no_tag = true;
                                 break;
                         }
                 }
  
-               if (!got_budget) {
-                       ret = blk_mq_get_dispatch_budget(hctx);
-                       if (ret == BLK_STS_RESOURCE)
-                               break;
-                       if (ret != BLK_STS_OK)
-                               goto fail_rq;
+               if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
+                       blk_mq_put_driver_tag(rq);
+                       break;
                 }
  
                 list_del_init(&rq->queuelist);
@@ -1154,21 +1100,25 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                 if (list_empty(list))
                         bd.last = true;
                 else {
-                       struct request *nxt;
-
                         nxt = list_first_entry(list, struct request, queuelist);
                         bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
                 }
  
                 ret = q->mq_ops->queue_rq(hctx, &bd);
                 if (ret == BLK_STS_RESOURCE) {
-                       blk_mq_put_driver_tag_hctx(hctx, rq);
+                       /*
+                        * If an I/O scheduler has been configured and we got a
+                        * driver tag for the next request already, free it again.
+                        */
+                       if (!list_empty(list)) {
+                               nxt = list_first_entry(list, struct request, queuelist);
+                               blk_mq_put_driver_tag(nxt);
+                       }
                         list_add(&rq->queuelist, list);
                         __blk_mq_requeue_request(rq);
                         break;
                 }
  
- fail_rq:
                 if (unlikely(ret != BLK_STS_OK)) {
                         errors++;
                         blk_mq_end_request(rq, BLK_STS_IOERR);
@@ -1185,13 +1135,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
          * that is where we will continue on next queue run.
          */
         if (!list_empty(list)) {
-               /*
-                * If an I/O scheduler has been configured and we got a driver
-                * tag for the next request already, free it again.
-                */
-               rq = list_first_entry(list, struct request, queuelist);
-               blk_mq_put_driver_tag(rq);
-
                 spin_lock(&hctx->lock);
                 list_splice_init(list, &hctx->dispatch);
                 spin_unlock(&hctx->lock);
@@ -1201,10 +1144,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                  * it is no longer set that means that it was cleared by another
                  * thread and hence that a queue rerun is needed.
                  *
-                * If TAG_WAITING is set that means that an I/O scheduler has
-                * been configured and another thread is waiting for a driver
-                * tag. To guarantee fairness, do not rerun this hardware queue
-                * but let the other thread grab the driver tag.
+                * If 'no_tag' is set, that means that we failed getting
+                * a driver tag with an I/O scheduler attached. If our dispatch
+                * waitqueue is no longer active, ensure that we run the queue
+                * AFTER adding our entries back to the list.
                  *
                  * If no I/O scheduler has been configured it is possible that
                  * the hardware queue got stopped and restarted before requests
@@ -1216,8 +1159,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
                  *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
                  *   and dm-rq.
                  */
-               if (!blk_mq_sched_needs_restart(hctx) &&
-                   !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
+               if (!blk_mq_sched_needs_restart(hctx) ||
+                   (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
                         blk_mq_run_hw_queue(hctx, true);
         }
  
@@ -1497,7 +1440,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
   * Should only be used carefully, when the caller knows we want to
   * bypass a potential IO scheduler on the target device.
   */
-void blk_mq_request_bypass_insert(struct request *rq)
+void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
  {
         struct blk_mq_ctx *ctx = rq->mq_ctx;
         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
@@ -1506,7 +1449,8 @@ void blk_mq_request_bypass_insert(struct request *rq)
         list_add_tail(&rq->queuelist, &hctx->dispatch);
         spin_unlock(&hctx->lock);
  
-       blk_mq_run_hw_queue(hctx, false);
+       if (run_queue)
+               blk_mq_run_hw_queue(hctx, false);
  }
  
  void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
@@ -1640,12 +1584,10 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
         if (!blk_mq_get_driver_tag(rq, NULL, false))
                 goto insert;
  
-       ret = blk_mq_get_dispatch_budget(hctx);
-       if (ret == BLK_STS_RESOURCE) {
+       if (!blk_mq_get_dispatch_budget(hctx)) {
                 blk_mq_put_driver_tag(rq);
                 goto insert;
-       } else if (ret != BLK_STS_OK)
-               goto fail_rq;
+       }
  
         new_cookie = request_to_qc_t(hctx, rq);
  
@@ -1663,7 +1605,6 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                 __blk_mq_requeue_request(rq);
                 goto insert;
         default:
- fail_rq:
                 *cookie = BLK_QC_T_NONE;
                 blk_mq_end_request(rq, ret);
                 return;
@@ -1737,13 +1678,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
         if (unlikely(is_flush_fua)) {
                 blk_mq_put_ctx(data.ctx);
                 blk_mq_bio_to_request(rq, bio);
-               if (q->elevator) {
-                       blk_mq_sched_insert_request(rq, false, true, true,
-                                       true);
-               } else {
-                       blk_insert_flush(rq);
-                       blk_mq_run_hw_queue(data.hctx, true);
-               }
+
+               /* bypass scheduler for flush rq */
+               blk_insert_flush(rq);
+               blk_mq_run_hw_queue(data.hctx, true);
         } else if (plug && q->nr_hw_queues == 1) {
                 struct request *last = NULL;
  
@@ -2086,6 +2024,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
  
         hctx->nr_ctx = 0;
  
+       init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+       INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
+
         if (set->ops->init_hctx &&
             set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
                 goto free_bitmap;