return NULL;
}
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
- bool reserved)
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+ unsigned int flags)
{
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_alloc_data alloc_data;
int ret;
- ret = blk_queue_enter(q, gfp);
+ ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
if (ret)
return ERR_PTR(ret);
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM,
- reserved, ctx, hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
- if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) {
+ if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) {
__blk_mq_run_hw_queue(hctx);
blk_mq_put_ctx(ctx);
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
- hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
ctx = alloc_data.ctx;
}
}
EXPORT_SYMBOL(blk_mq_alloc_request);
+struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
+ unsigned int flags, unsigned int hctx_idx)
+{
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ struct request *rq;
+ struct blk_mq_alloc_data alloc_data;
+ int ret;
+
+ /*
+ * If the tag allocator sleeps we could get an allocation for a
+ * different hardware context. No need to complicate the low level
+ * allocator for this for the rare use case of a command tied to
+ * a specific queue.
+ */
+ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
+ return ERR_PTR(-EINVAL);
+
+ if (hctx_idx >= q->nr_hw_queues)
+ return ERR_PTR(-EIO);
+
+ ret = blk_queue_enter(q, true);
+ if (ret)
+ return ERR_PTR(ret);
+
+ hctx = q->queue_hw_ctx[hctx_idx];
+ ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
+
+ blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
+ rq = __blk_mq_alloc_request(&alloc_data, rw);
+ if (!rq) {
+ blk_queue_exit(q);
+ return ERR_PTR(-EWOULDBLOCK);
+ }
+
+ return rq;
+}
+EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
+
static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx, struct request *rq)
{
* If a request wasn't started before the queue was
* marked dying, kill it here or it'll go unnoticed.
*/
- if (unlikely(blk_queue_dying(rq->q)))
- blk_mq_complete_request(rq, -EIO);
+ if (unlikely(blk_queue_dying(rq->q))) {
+ rq->errors = -EIO;
+ blk_mq_end_request(rq, rq->errors);
+ }
return;
}
if (rq->cmd_flags & REQ_NO_TIMEOUT)
}
}
-static void blk_mq_rq_timer(unsigned long priv)
+static void blk_mq_timeout_work(struct work_struct *work)
{
- struct request_queue *q = (struct request_queue *)priv;
+ struct request_queue *q =
+ container_of(work, struct request_queue, timeout_work);
struct blk_mq_timeout_data data = {
.next = 0,
.next_set = 0,
};
int i;
+ /* A deadlock might occur if a request is stuck requiring a
+ * timeout at the same time a queue freeze is waiting
+ * completion, since the timeout code would not be able to
+ * acquire the queue reference here.
+ *
+ * That's why we don't use blk_queue_enter here; instead, we use
+ * percpu_ref_tryget directly, because we need to be able to
+ * obtain a reference even in the short window between the queue
+ * starting to freeze, by dropping the first reference in
+ * blk_mq_freeze_queue_start, and the moment the last request is
+ * consumed, marked by the instant q_usage_counter reaches
+ * zero.
+ */
+ if (!percpu_ref_tryget(&q->q_usage_counter))
+ return;
+
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
if (data.next_set) {
blk_mq_tag_idle(hctx);
}
}
+ blk_queue_exit(q);
}
/*
struct list_head *dptr;
int queued;
- WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
-
if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
return;
+ WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+ cpu_online(hctx->next_cpu));
+
hctx->run++;
/*
switch (ret) {
case BLK_MQ_RQ_QUEUE_OK:
queued++;
- continue;
+ break;
case BLK_MQ_RQ_QUEUE_BUSY:
list_add(&rq->queuelist, &rq_list);
__blk_mq_requeue_request(rq);
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
- int cpu = hctx->next_cpu, next_cpu;
+ int next_cpu;
next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
if (next_cpu >= nr_cpu_ids)
hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
-
- return cpu;
}
return hctx->next_cpu;
EXPORT_SYMBOL(blk_mq_delay_queue);
static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx,
struct request *rq,
bool at_head)
{
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+
trace_block_rq_insert(hctx->queue, rq);
if (at_head)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
- __blk_mq_insert_req_list(hctx, ctx, rq, at_head);
+ __blk_mq_insert_req_list(hctx, rq, at_head);
blk_mq_hctx_mark_pending(hctx, ctx);
}
void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
- bool async)
+ bool async)
{
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
-
- current_ctx = blk_mq_get_ctx(q);
- if (!cpu_online(ctx->cpu))
- rq->mq_ctx = ctx = current_ctx;
hctx = q->mq_ops->map_queue(q, ctx->cpu);
if (run_queue)
blk_mq_run_hw_queue(hctx, async);
-
- blk_mq_put_ctx(current_ctx);
}
static void blk_mq_insert_requests(struct request_queue *q,
{
struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *current_ctx;
trace_block_unplug(q, depth, !from_schedule);
- current_ctx = blk_mq_get_ctx(q);
-
- if (!cpu_online(ctx->cpu))
- ctx = current_ctx;
hctx = q->mq_ops->map_queue(q, ctx->cpu);
/*
struct request *rq;
rq = list_first_entry(list, struct request, queuelist);
+ BUG_ON(rq->mq_ctx != ctx);
list_del_init(&rq->queuelist);
- rq->mq_ctx = ctx;
- __blk_mq_insert_req_list(hctx, ctx, rq, false);
+ __blk_mq_insert_req_list(hctx, rq, false);
}
blk_mq_hctx_mark_pending(hctx, ctx);
spin_unlock(&ctx->lock);
blk_mq_run_hw_queue(hctx, from_schedule);
- blk_mq_put_ctx(current_ctx);
}
static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
rw |= REQ_SYNC;
trace_block_getrq(q, bio, rw);
- blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
- hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
if (unlikely(!rq)) {
__blk_mq_run_hw_queue(hctx);
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q,
- __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, rw);
ctx = alloc_data.ctx;
hctx = alloc_data.hctx;
blk_queue_bounce(q, &bio);
+ blk_queue_split(q, &bio, q->bio_split);
+
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio_io_error(bio);
return BLK_QC_T_NONE;
}
- blk_queue_split(q, &bio, q->bio_split);
-
- if (!is_flush_fua && !blk_queue_nomerges(q)) {
- if (blk_attempt_plug_merge(q, bio, &request_count,
- &same_queue_rq))
- return BLK_QC_T_NONE;
- } else
- request_count = blk_plug_queued_count(q);
+ if (!is_flush_fua && !blk_queue_nomerges(q) &&
+ blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
+ return BLK_QC_T_NONE;
rq = blk_mq_map_request(q, bio, &data);
if (unlikely(!rq))
blk_mq_put_ctx(data.ctx);
if (!old_rq)
goto done;
- if (!blk_mq_direct_issue_request(old_rq, &cookie))
- goto done;
- blk_mq_insert_request(old_rq, false, true, true);
+ if (test_bit(BLK_MQ_S_STOPPED, &data.hctx->state) ||
+ blk_mq_direct_issue_request(old_rq, &cookie) != 0)
+ blk_mq_insert_request(old_rq, false, true, true);
goto done;
}
blk_queue_split(q, &bio, q->bio_split);
- if (!is_flush_fua && !blk_queue_nomerges(q) &&
- blk_attempt_plug_merge(q, bio, &request_count, NULL))
- return BLK_QC_T_NONE;
+ if (!is_flush_fua && !blk_queue_nomerges(q)) {
+ if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
+ return BLK_QC_T_NONE;
+ } else
+ request_count = blk_plug_queued_count(q);
rq = blk_mq_map_request(q, bio, &data);
if (unlikely(!rq))
INIT_LIST_HEAD(&tags->page_list);
tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
- GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
+ GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
set->numa_node);
if (!tags->rqs) {
blk_mq_free_tags(tags);
do {
page = alloc_pages_node(set->numa_node,
- GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
+ GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
this_order);
if (page)
break;
* Allow kmemleak to scan these pages as they contain pointers
* to additional allocations like via ops->init_request().
*/
- kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
+ kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
entries_per_page = order_to_size(this_order) / rq_size;
to_do = min(entries_per_page, set->queue_depth - i);
left -= to_do * rq_size;
return 0;
}
+/*
+ * 'cpu' is going away. splice any existing rq_list entries from this
+ * software queue to the hw queue dispatch list, and ensure that it
+ * gets run.
+ */
static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
{
- struct request_queue *q = hctx->queue;
struct blk_mq_ctx *ctx;
LIST_HEAD(tmp);
- /*
- * Move ctx entries to new CPU, if this one is going away.
- */
- ctx = __blk_mq_get_ctx(q, cpu);
+ ctx = __blk_mq_get_ctx(hctx->queue, cpu);
spin_lock(&ctx->lock);
if (!list_empty(&ctx->rq_list)) {
if (list_empty(&tmp))
return NOTIFY_OK;
- ctx = blk_mq_get_ctx(q);
- spin_lock(&ctx->lock);
-
- while (!list_empty(&tmp)) {
- struct request *rq;
-
- rq = list_first_entry(&tmp, struct request, queuelist);
- rq->mq_ctx = ctx;
- list_move_tail(&rq->queuelist, &ctx->rq_list);
- }
-
- hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_hctx_mark_pending(hctx, ctx);
-
- spin_unlock(&ctx->lock);
+ spin_lock(&hctx->lock);
+ list_splice_tail_init(&tmp, &hctx->dispatch);
+ spin_unlock(&hctx->lock);
blk_mq_run_hw_queue(hctx, true);
- blk_mq_put_ctx(ctx);
return NOTIFY_OK;
}
static void blk_mq_map_swqueue(struct request_queue *q,
const struct cpumask *online_mask)
{
- unsigned int i;
+ unsigned int i, hctx_idx;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
/*
* Map software to hardware queues
*/
- queue_for_each_ctx(q, ctx, i) {
+ for_each_possible_cpu(i) {
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpumask_test_cpu(i, online_mask))
continue;
+ hctx_idx = q->mq_map[i];
+ /* unmapped hw queue can be remapped after CPU topo changed */
+ if (!set->tags[hctx_idx]) {
+ set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx);
+
+ /*
+ * If tags initialization fail for some hctx,
+ * that hctx won't be brought online. In this
+ * case, remap the current ctx to hctx[0] which
+ * is guaranteed to always have tags allocated
+ */
+ if (!set->tags[hctx_idx])
+ q->mq_map[i] = 0;
+ }
+
+ ctx = per_cpu_ptr(q->queue_ctx, i);
hctx = q->mq_ops->map_queue(q, i);
cpumask_set_cpu(i, hctx->cpumask);
ctx->index_hw = hctx->nr_ctx;
* disable it and free the request entries.
*/
if (!hctx->nr_ctx) {
- if (set->tags[i]) {
+ /* Never unmap queue 0. We need it as a
+ * fallback in case of a new remap fails
+ * allocation
+ */
+ if (i && set->tags[i]) {
blk_mq_free_rq_map(set, set->tags[i], i);
set->tags[i] = NULL;
}
continue;
}
- /* unmapped hw queue can be remapped after CPU topo changed */
- if (!set->tags[i])
- set->tags[i] = blk_mq_init_rq_map(set, i);
hctx->tags = set->tags[i];
WARN_ON(!hctx->tags);
+ cpumask_copy(hctx->tags->cpumask, hctx->cpumask);
/*
* Set the map size to the number of mapped software queues.
* This is more accurate and more efficient than looping
hctx->next_cpu = cpumask_first(hctx->cpumask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
-
- queue_for_each_ctx(q, ctx, i) {
- if (!cpumask_test_cpu(i, online_mask))
- continue;
-
- hctx = q->mq_ops->map_queue(q, i);
- cpumask_set_cpu(i, hctx->tags->cpumask);
- }
}
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
hctxs[i]->queue_num = i;
}
- setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
+ INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
q->nr_queues = nr_cpu_ids;