blk-mq: Export blk_mq_freeze_queue_wait
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b29e7dc7b309e4cf939cd2c011b26b38dfd4df73..8da2c04bb88f4762a07149e48223a6df66957af3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -75,10 +75,11 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
 
-static void blk_mq_freeze_queue_wait(struct request_queue *q)
+void blk_mq_freeze_queue_wait(struct request_queue *q)
 {
        wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
 }
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
 
 /*
  * Guarantee no request is in use, so we can change any data structure of
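Exporting blk_mq_freeze_queue_wait() lets a driver split the freeze into a non-blocking start and a separate wait, so several queues can drain concurrently instead of serially. A minimal sketch of how a multi-queue driver might use the pair (the mydrv_* name and the queue array are illustrative, not part of this patch):

    #include <linux/blkdev.h>
    #include <linux/blk-mq.h>

    /* Hypothetical driver holding several request queues. */
    static void mydrv_freeze_all(struct request_queue **queues, int nr)
    {
            int i;

            /* Kick off the freeze on every queue without blocking... */
            for (i = 0; i < nr; i++)
                    blk_mq_freeze_queue_start(queues[i]);

            /* ...then wait once per queue; the drains overlap. */
            for (i = 0; i < nr; i++)
                    blk_mq_freeze_queue_wait(queues[i]);
    }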
@@ -234,6 +235,7 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
                        }
                        rq->tag = tag;
                        rq->internal_tag = -1;
+                       data->hctx->tags->rqs[rq->tag] = rq;
                }
 
                blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
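The new store records the request in the tags->rqs[] table at the moment the driver tag is assigned; without it, a later tag-to-request lookup on the completion path would read a stale or NULL pointer for scheduler-allocated requests. For reference, the lookup side is essentially a bounds-checked array read; a sketch, not the verbatim helper:

    struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
    {
            if (tag < tags->nr_tags)
                    return tags->rqs[tag];  /* set by __blk_mq_alloc_request() above */
            return NULL;
    }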
@@ -273,10 +275,9 @@ EXPORT_SYMBOL(blk_mq_alloc_request);
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
                unsigned int flags, unsigned int hctx_idx)
 {
-       struct blk_mq_hw_ctx *hctx;
-       struct blk_mq_ctx *ctx;
+       struct blk_mq_alloc_data alloc_data = { .flags = flags };
        struct request *rq;
-       struct blk_mq_alloc_data alloc_data;
+       unsigned int cpu;
        int ret;
 
        /*
@@ -299,25 +300,23 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
         * Check if the hardware context is actually mapped to anything.
         * If not tell the caller that it should skip this queue.
         */
-       hctx = q->queue_hw_ctx[hctx_idx];
-       if (!blk_mq_hw_queue_mapped(hctx)) {
-               ret = -EXDEV;
-               goto out_queue_exit;
+       alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
+       if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
+               blk_queue_exit(q);
+               return ERR_PTR(-EXDEV);
        }
-       ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
+       cpu = cpumask_first(alloc_data.hctx->cpumask);
+       alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 
-       blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-       rq = __blk_mq_alloc_request(&alloc_data, rw);
-       if (!rq) {
-               ret = -EWOULDBLOCK;
-               goto out_queue_exit;
-       }
-
-       return rq;
+       rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
 
-out_queue_exit:
+       blk_mq_put_ctx(alloc_data.ctx);
        blk_queue_exit(q);
-       return ERR_PTR(ret);
+
+       if (!rq)
+               return ERR_PTR(-EWOULDBLOCK);
+
+       return rq;
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
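blk_mq_alloc_request_hctx() now carries the chosen hctx/ctx in a stack blk_mq_alloc_data and routes the allocation through blk_mq_sched_get_request(), so it works whether or not an I/O scheduler is attached. A caller targeting a specific hardware queue, e.g. to issue a per-queue internal command, might look like this (hypothetical usage; assumes this tree already has REQ_OP_DRV_OUT):

    /* Hypothetical: issue a driver-internal command on hardware queue 0. */
    static struct request *mydrv_alloc_internal_rq(struct request_queue *q)
    {
            struct request *rq;

            rq = blk_mq_alloc_request_hctx(q, REQ_OP_DRV_OUT,
                                           BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED, 0);
            if (IS_ERR(rq))
                    return NULL;  /* -EXDEV if hctx 0 is unmapped, -EWOULDBLOCK if no tag */
            return rq;
    }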
 
@@ -852,6 +851,9 @@ done:
                return true;
        }
 
+       if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
+               data.flags |= BLK_MQ_REQ_RESERVED;
+
        rq->tag = blk_mq_get_tag(&data);
        if (rq->tag >= 0) {
                if (blk_mq_tag_busy(data.hctx)) {
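With a scheduler attached, a request holds two tags: an internal scheduler tag and, once dispatched, a driver tag. The hunk above ensures that a request allocated from the reserved scheduler-tag pool also draws its driver tag from the reserved pool, so a reserved request cannot starve behind normal I/O. blk_mq_tag_is_reserved() is essentially a range check over the low tag values; a sketch of the helper:

    static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
                                              unsigned int tag)
    {
            return tag < tags->nr_reserved_tags;  /* reserved tags occupy the low range */
    }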
@@ -904,6 +906,44 @@ static bool reorder_tags_to_front(struct list_head *list)
        return first != NULL;
 }
 
+static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+                               void *key)
+{
+       struct blk_mq_hw_ctx *hctx;
+
+       hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+
+       list_del(&wait->task_list);
+       clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
+       blk_mq_run_hw_queue(hctx, true);
+       return 1;
+}
+
+static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
+{
+       struct sbq_wait_state *ws;
+
+       /*
+        * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
+        * The thread which wins the race to grab this bit adds the hardware
+        * queue to the wait queue.
+        */
+       if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
+           test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+               return false;
+
+       init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+       ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+
+       /*
+        * As soon as this returns, it's no longer safe to fiddle with
+        * hctx->dispatch_wait, since a completion can wake up the wait queue
+        * and unlock the bit.
+        */
+       add_wait_queue(&ws->wait, &hctx->dispatch_wait);
+       return true;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
        struct request_queue *q = hctx->queue;
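The wake side of this mechanism lives in the sbitmap code: freeing a driver tag clears its bit and wakes the per-queue sbq wait state, which invokes each queued entry's wake function, here blk_mq_dispatch_wake(), instead of waking a sleeping task. A simplified sketch of that path (the real code in lib/sbitmap.c batches wakeups and differs in detail):

    static void sbq_wake_up(struct sbitmap_queue *sbq)
    {
            struct sbq_wait_state *ws = sbq_wake_ptr(sbq);

            if (!ws)
                    return;

            if (atomic_dec_return(&ws->wait_cnt) <= 0) {
                    atomic_set(&ws->wait_cnt, sbq->wake_batch);
                    /* Runs blk_mq_dispatch_wake() for any queued hctx entry. */
                    wake_up(&ws->wait);
            }
    }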
@@ -931,15 +971,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                                continue;
 
                        /*
-                        * We failed getting a driver tag. Mark the queue(s)
-                        * as needing a restart. Retry getting a tag again,
-                        * in case the needed IO completed right before we
-                        * marked the queue as needing a restart.
+                        * The initial allocation attempt failed, so we need to
+                        * rerun the hardware queue when a tag is freed.
                         */
-                       blk_mq_sched_mark_restart(hctx);
-                       if (!blk_mq_get_driver_tag(rq, &hctx, false))
+                       if (blk_mq_dispatch_wait_add(hctx)) {
+                               /*
+                                * It's possible that a tag was freed in the
+                                * window between the allocation failure and
+                                * adding the hardware queue to the wait queue.
+                                */
+                               if (!blk_mq_get_driver_tag(rq, &hctx, false))
+                                       break;
+                       } else {
                                break;
+                       }
                }
+
                list_del_init(&rq->queuelist);
 
                bd.rq = rq;
@@ -995,10 +1042,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
                 *
                 * blk_mq_run_hw_queue() already checks the STOPPED bit
                 *
-                * If RESTART is set, then let completion restart the queue
-                * instead of potentially looping here.
+                * If RESTART or TAG_WAITING is set, then let completion restart
+                * the queue instead of potentially looping here.
                 */
-               if (!blk_mq_sched_needs_restart(hctx))
+               if (!blk_mq_sched_needs_restart(hctx) &&
+                   !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
                        blk_mq_run_hw_queue(hctx, true);
        }
 
@@ -1667,16 +1715,20 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                                        unsigned int reserved_tags)
 {
        struct blk_mq_tags *tags;
+       int node;
+
+       node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+       if (node == NUMA_NO_NODE)
+               node = set->numa_node;
 
-       tags = blk_mq_init_tags(nr_tags, reserved_tags,
-                               set->numa_node,
+       tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
                                BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
        if (!tags)
                return NULL;
 
        tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
-                                set->numa_node);
+                                node);
        if (!tags->rqs) {
                blk_mq_free_tags(tags);
                return NULL;
@@ -1684,7 +1736,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 
        tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
-                                set->numa_node);
+                                node);
        if (!tags->static_rqs) {
                kfree(tags->rqs);
                blk_mq_free_tags(tags);
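blk_mq_hw_queue_to_node() resolves a hardware queue index to the NUMA node of a CPU mapped to it, so the tag tables above land on memory local to the CPUs that will touch them; if no CPU maps to the queue it returns NUMA_NO_NODE and the code falls back to set->numa_node. Roughly (a sketch of the helper in block/blk-mq-cpumap.c):

    int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
    {
            int i;

            /* Return the node of the first CPU that maps to this hw queue. */
            for_each_possible_cpu(i) {
                    if (mq_map[i] == index)
                            return cpu_to_node(i);
            }

            return NUMA_NO_NODE;
    }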
@@ -1704,6 +1756,11 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 {
        unsigned int i, j, entries_per_page, max_order = 4;
        size_t rq_size, left;
+       int node;
+
+       node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+       if (node == NUMA_NO_NODE)
+               node = set->numa_node;
 
        INIT_LIST_HEAD(&tags->page_list);
 
@@ -1725,7 +1782,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                        this_order--;
 
                do {
-                       page = alloc_pages_node(set->numa_node,
+                       page = alloc_pages_node(node,
                                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
                                this_order);
                        if (page)
@@ -1758,7 +1815,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                        if (set->ops->init_request) {
                                if (set->ops->init_request(set->driver_data,
                                                rq, hctx_idx, i,
-                                               set->numa_node)) {
+                                               node)) {
                                        tags->static_rqs[i] = NULL;
                                        goto fail;
                                }
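The resolved node is also forwarded to the driver's ->init_request() hook, so per-request driver data can be allocated on the same node as the request pages. A hypothetical hook matching the five-argument signature called above (the mydrv_* names and MYDRV_SENSE_SIZE are illustrative):

    #define MYDRV_SENSE_SIZE 96  /* illustrative */

    struct mydrv_cmd {
            void *sense;
    };

    static int mydrv_init_request(void *data, struct request *rq,
                                  unsigned int hctx_idx, unsigned int rq_idx,
                                  unsigned int numa_node)
    {
            struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

            /* numa_node is now the hw queue's node, not set->numa_node. */
            cmd->sense = kmalloc_node(MYDRV_SENSE_SIZE, GFP_KERNEL, numa_node);
            return cmd->sense ? 0 : -ENOMEM;
    }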