struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
unsigned int flags, unsigned int hctx_idx)
{
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
+ struct blk_mq_alloc_data alloc_data = { .flags = flags }; /* members other than .flags start out zeroed */
struct request *rq;
- struct blk_mq_alloc_data alloc_data;
+ unsigned int cpu;
int ret;
/*
* Check if the hardware context is actually mapped to anything.
* If not tell the caller that it should skip this queue.
*/
- hctx = q->queue_hw_ctx[hctx_idx];
- if (!blk_mq_hw_queue_mapped(hctx)) {
- ret = -EXDEV;
- goto out_queue_exit;
+ alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
+ if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
+ blk_queue_exit(q);
+ return ERR_PTR(-EXDEV); /* unmapped hardware context: caller should skip this queue */
}
- ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
+ cpu = cpumask_first(alloc_data.hctx->cpumask); /* first CPU in this hardware context's cpumask */
+ alloc_data.ctx = __blk_mq_get_ctx(q, cpu); /* ctx looked up for that CPU */
- blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
- rq = __blk_mq_alloc_request(&alloc_data, rw);
- if (!rq) {
- ret = -EWOULDBLOCK;
- goto out_queue_exit;
- }
-
- return rq;
+ rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); /* hctx and ctx were filled in above */
-out_queue_exit:
+ blk_mq_put_ctx(alloc_data.ctx); /* NOTE(review): ctx came from __blk_mq_get_ctx() above; confirm this put is balanced */
blk_queue_exit(q);
- return ERR_PTR(ret);
+ /*
+ * blk_mq_put_ctx() and blk_queue_exit() have already run at this point
+ * whether or not a request was obtained; a NULL request is reported to
+ * the caller as ERR_PTR(-EWOULDBLOCK).
+ */
+ if (!rq)
+ return ERR_PTR(-EWOULDBLOCK);
+
+ return rq;
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
return true;
}
+ if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
+ data.flags |= BLK_MQ_REQ_RESERVED;
+
rq->tag = blk_mq_get_tag(&data);
if (rq->tag >= 0) {
if (blk_mq_tag_busy(data.hctx)) {
return first != NULL;
}
+static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+ void *key)
+{
+ struct blk_mq_hw_ctx *hctx;
+ /*
+ * Wake function that blk_mq_dispatch_wait_add() installs on
+ * hctx->dispatch_wait. @wait is that embedded entry, so map it back to
+ * the hardware context that owns it. @mode, @flags and @key are not
+ * used here. Always returns 1.
+ */
+ hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+ /*
+ * Take the entry off the wait queue first, then release the
+ * TAG_WAITING bit (which acts as the lock for dispatch_wait, so the
+ * entry may be re-added by someone else from here on), and finally
+ * rerun the hardware queue. The 'true' argument presumably requests
+ * an asynchronous run - confirm against blk_mq_run_hw_queue().
+ */
+ list_del(&wait->task_list);
+ clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
+ blk_mq_run_hw_queue(hctx, true);
+ return 1;
+}
+
+static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
+{
+ struct sbq_wait_state *ws;
+
+ /*
+ * Queue hctx->dispatch_wait on a tag wait queue so that
+ * blk_mq_dispatch_wake() runs for this hardware queue when that wait
+ * queue is woken up. Returns true if this call added the entry, false
+ * if TAG_WAITING was already set and nothing was added.
+ *
+ * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
+ * The thread which wins the race to grab this bit adds the hardware
+ * queue to the wait queue.
+ *
+ * The plain test_bit() is evaluated before the atomic test-and-set,
+ * presumably to skip the atomic operation when the bit is already set.
+ */
+ if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
+ test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+ return false;
+ /*
+ * We own the bit now: set blk_mq_dispatch_wake() as the entry's wake
+ * function and look up the wait state to queue on from the bitmap of
+ * hctx->tags.
+ */
+ init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+ ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+
+ /*
+ * As soon as this returns, it's no longer safe to fiddle with
+ * hctx->dispatch_wait, since a completion can wake up the wait queue
+ * and unlock the bit.
+ */
+ add_wait_queue(&ws->wait, &hctx->dispatch_wait);
+ return true;
+}
+
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
struct request_queue *q = hctx->queue;
continue;
/*
- * We failed getting a driver tag. Mark the queue(s)
- * as needing a restart. Retry getting a tag again,
- * in case the needed IO completed right before we
- * marked the queue as needing a restart.
+ * The initial allocation attempt failed, so we need to
+ * rerun the hardware queue when a tag is freed.
*/
- blk_mq_sched_mark_restart(hctx);
- if (!blk_mq_get_driver_tag(rq, &hctx, false))
+ if (blk_mq_dispatch_wait_add(hctx)) {
+ /*
+ * It's possible that a tag was freed in the
+ * window between the allocation failure and
+ * adding the hardware queue to the wait queue.
+ */
+ if (!blk_mq_get_driver_tag(rq, &hctx, false))
+ break;
+ } else {
break;
+ }
}
+
list_del_init(&rq->queuelist);
bd.rq = rq;
*
* blk_mq_run_hw_queue() already checks the STOPPED bit
*
- * If RESTART is set, then let completion restart the queue
- * instead of potentially looping here.
+ * If RESTART or TAG_WAITING is set, then let completion restart
+ * the queue instead of potentially looping here.
*/
- if (!blk_mq_sched_needs_restart(hctx))
+ if (!blk_mq_sched_needs_restart(hctx) &&
+ !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
blk_mq_run_hw_queue(hctx, true);
}
unsigned int reserved_tags)
{
struct blk_mq_tags *tags;
+ int node;
+
+ node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+ if (node == NUMA_NO_NODE)
+ node = set->numa_node;
- tags = blk_mq_init_tags(nr_tags, reserved_tags,
- set->numa_node,
+ tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
if (!tags)
return NULL;
tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
- set->numa_node);
+ node);
if (!tags->rqs) {
blk_mq_free_tags(tags);
return NULL;
tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
- set->numa_node);
+ node);
if (!tags->static_rqs) {
kfree(tags->rqs);
blk_mq_free_tags(tags);
{
unsigned int i, j, entries_per_page, max_order = 4;
size_t rq_size, left;
+ int node;
+
+ node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+ if (node == NUMA_NO_NODE)
+ node = set->numa_node;
INIT_LIST_HEAD(&tags->page_list);
this_order--;
do {
- page = alloc_pages_node(set->numa_node,
+ page = alloc_pages_node(node,
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
this_order);
if (page)
if (set->ops->init_request) {
if (set->ops->init_request(set->driver_data,
rq, hctx_idx, i,
- set->numa_node)) {
+ node)) {
tags->static_rqs[i] = NULL;
goto fail;
}