put_cpu();
}
+static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
+{
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+ rcu_read_unlock();
+ else
+ srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
+}
+
+static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
+{
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+ rcu_read_lock();
+ else
+ *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
+}
+
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
if (rq->rq_flags & RQF_DONTPREP)
blk_mq_request_bypass_insert(rq, false);
else
- blk_mq_sched_insert_request(rq, true, false, false, true);
+ blk_mq_sched_insert_request(rq, true, false, false);
}
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
- blk_mq_sched_insert_request(rq, false, false, false, true);
+ blk_mq_sched_insert_request(rq, false, false, false);
}
blk_mq_run_hw_queues(q, false);
}
}
+#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8
+#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4
+/*
+ * Update dispatch busy with the Exponential Weighted Moving Average(EWMA):
+ * - EWMA is one simple way to compute running average value
+ * - weight(7/8 and 1/8) is applied so that it can decrease exponentially
+ * - take 4 as factor for avoiding to get too small(0) result, and this
+ * factor doesn't matter because EWMA decreases exponentially
+ */
+static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
+{
+ unsigned int ewma;
+
+ if (hctx->queue->elevator)
+ return;
+
+ ewma = hctx->dispatch_busy;
+
+ if (!ewma && !busy)
+ return;
+
+ ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1;
+ if (busy)
+ ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR;
+ ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
+
+ hctx->dispatch_busy = ewma;
+}
+
+#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
+
+/*
+ * Returns true if we did some work AND can potentially do more.
+ */
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bool got_budget)
{
struct request *rq, *nxt;
bool no_tag = false;
int errors, queued;
+ blk_status_t ret = BLK_STS_OK;
if (list_empty(list))
return false;
errors = queued = 0;
do {
struct blk_mq_queue_data bd;
- blk_status_t ret;
rq = list_first_entry(list, struct request, queuelist);
}
ret = q->mq_ops->queue_rq(hctx, &bd);
- if (ret == BLK_STS_RESOURCE) {
+ if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
/*
* If an I/O scheduler has been configured and we got a
* driver tag for the next request already, free it
* that is where we will continue on next queue run.
*/
if (!list_empty(list)) {
+ bool needs_restart;
+
spin_lock(&hctx->lock);
list_splice_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
* - Some but not all block drivers stop a queue before
* returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
* and dm-rq.
+ *
+ * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
+ * bit is set, run queue after a delay to avoid IO stalls
+ * that could otherwise occur if the queue is idle.
*/
- if (!blk_mq_sched_needs_restart(hctx) ||
+ needs_restart = blk_mq_sched_needs_restart(hctx);
+ if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
- }
+ else if (needs_restart && (ret == BLK_STS_RESOURCE))
+ blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
+
+ blk_mq_update_dispatch_busy(hctx, true);
+ return false;
+ } else
+ blk_mq_update_dispatch_busy(hctx, false);
+
+ /*
+ * If the host/device is unable to accept more work, inform the
+ * caller of that.
+ */
+ if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
+ return false;
return (queued + errors) != 0;
}
*/
WARN_ON_ONCE(in_interrupt());
- if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
- rcu_read_lock();
- blk_mq_sched_dispatch_requests(hctx);
- rcu_read_unlock();
- } else {
- might_sleep();
+ might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
- srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
- blk_mq_sched_dispatch_requests(hctx);
- srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
- }
+ hctx_lock(hctx, &srcu_idx);
+ blk_mq_sched_dispatch_requests(hctx);
+ hctx_unlock(hctx, srcu_idx);
}
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
- if (blk_mq_hctx_has_pending(hctx)) {
+ int srcu_idx;
+ bool need_run;
+
+ /*
+ * When queue is quiesced, we may be switching io scheduler, or
+ * updating nr_hw_queues, or other things, and we can't run queue
+ * any more, even __blk_mq_hctx_has_pending() can't be called safely.
+ *
+ * And queue will be rerun in blk_mq_unquiesce_queue() if it is
+ * quiesced.
+ */
+ hctx_lock(hctx, &srcu_idx);
+ need_run = !blk_queue_quiesced(hctx->queue) &&
+ blk_mq_hctx_has_pending(hctx);
+ hctx_unlock(hctx, srcu_idx);
+
+ if (need_run) {
__blk_mq_delay_run_hw_queue(hctx, async, 0);
return true;
}
return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
}
-static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
- struct request *rq,
- blk_qc_t *cookie, bool may_sleep)
+static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq,
+ blk_qc_t *cookie)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
};
blk_qc_t new_cookie;
blk_status_t ret;
+
+ new_cookie = request_to_qc_t(hctx, rq);
+
+ /*
+ * For OK queue, we are done. For error, caller may kill it.
+ * Any other error (busy), just add it to our list as we
+ * previously would have done.
+ */
+ ret = q->mq_ops->queue_rq(hctx, &bd);
+ switch (ret) {
+ case BLK_STS_OK:
+ blk_mq_update_dispatch_busy(hctx, false);
+ *cookie = new_cookie;
+ break;
+ case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
+ blk_mq_update_dispatch_busy(hctx, true);
+ __blk_mq_requeue_request(rq);
+ break;
+ default:
+ blk_mq_update_dispatch_busy(hctx, false);
+ *cookie = BLK_QC_T_NONE;
+ break;
+ }
+
+ return ret;
+}
+
+static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq,
+ blk_qc_t *cookie,
+ bool bypass_insert)
+{
+ struct request_queue *q = rq->q;
bool run_queue = true;
- /* RCU or SRCU read lock is needed before checking quiesced flag */
+ /*
+ * RCU or SRCU read lock is needed before checking quiesced flag.
+ *
+ * When queue is stopped or quiesced, ignore 'bypass_insert' from
+ * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
+ * and avoid driver to try to dispatch again.
+ */
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
run_queue = false;
+ bypass_insert = false;
goto insert;
}
- if (q->elevator)
+ if (q->elevator && !bypass_insert)
goto insert;
if (!blk_mq_get_dispatch_budget(hctx))
goto insert;
}
- new_cookie = request_to_qc_t(hctx, rq);
-
- /*
- * For OK queue, we are done. For error, kill it. Any other
- * error (busy), just add it to our list as we previously
- * would have done
- */
- ret = q->mq_ops->queue_rq(hctx, &bd);
- switch (ret) {
- case BLK_STS_OK:
- *cookie = new_cookie;
- return;
- case BLK_STS_RESOURCE:
- __blk_mq_requeue_request(rq);
- goto insert;
- default:
- *cookie = BLK_QC_T_NONE;
- blk_mq_end_request(rq, ret);
- return;
- }
-
+ return __blk_mq_issue_directly(hctx, rq, cookie);
insert:
- blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
+ if (bypass_insert)
+ return BLK_STS_RESOURCE;
+
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
+ return BLK_STS_OK;
}
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq, blk_qc_t *cookie)
{
- if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
- rcu_read_lock();
- __blk_mq_try_issue_directly(hctx, rq, cookie, false);
- rcu_read_unlock();
- } else {
- unsigned int srcu_idx;
+ blk_status_t ret;
+ int srcu_idx;
- might_sleep();
+ might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
- srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
- __blk_mq_try_issue_directly(hctx, rq, cookie, true);
- srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
+ hctx_lock(hctx, &srcu_idx);
+
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
+ if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
+ blk_mq_sched_insert_request(rq, false, true, false);
+ else if (ret != BLK_STS_OK)
+ blk_mq_end_request(rq, ret);
+
+ hctx_unlock(hctx, srcu_idx);
+}
+
+blk_status_t blk_mq_request_issue_directly(struct request *rq)
+{
+ blk_status_t ret;
+ int srcu_idx;
+ blk_qc_t unused_cookie;
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+
+ hctx_lock(hctx, &srcu_idx);
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
+ hctx_unlock(hctx, srcu_idx);
+
+ return ret;
+}
+
+void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list)
+{
+ while (!list_empty(list)) {
+ blk_status_t ret;
+ struct request *rq = list_first_entry(list, struct request,
+ queuelist);
+
+ list_del_init(&rq->queuelist);
+ ret = blk_mq_request_issue_directly(rq);
+ if (ret != BLK_STS_OK) {
+ list_add(&rq->queuelist, list);
+ break;
+ }
}
}
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
}
- } else if (q->nr_hw_queues > 1 && is_sync) {
+ } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
+ !data.hctx->dispatch_busy)) {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
} else if (q->elevator) {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
- blk_mq_sched_insert_request(rq, false, true, true, true);
+ blk_mq_sched_insert_request(rq, false, true, true);
} else {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- cleanup_srcu_struct(hctx->queue_rq_srcu);
-
blk_mq_remove_cpuhp(hctx);
- blk_free_flush_queue(hctx->fq);
- sbitmap_free(&hctx->ctx_map);
}
static void blk_mq_exit_hw_queues(struct request_queue *q,
* runtime
*/
hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
- GFP_KERNEL, node);
+ GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
if (!hctx->ctxs)
goto unregister_cpu_notifier;
- if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
- node))
+ if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
+ GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
goto free_ctxs;
hctx->nr_ctx = 0;
if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
goto exit_hctx;
- hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
+ hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
+ GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
if (!hctx->fq)
goto sched_exit_hctx;
struct blk_mq_hw_ctx *hctx;
unsigned int i;
- cancel_delayed_work_sync(&q->requeue_work);
-
/* hctx kobj stays in hctx */
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx)
node = blk_mq_hw_queue_to_node(q->mq_map, i);
hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
- GFP_KERNEL, node);
+ GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
+ node);
if (!hctxs[i])
break;
- if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
- node)) {
+ if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask,
+ GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
+ node)) {
kfree(hctxs[i]);
hctxs[i] = NULL;
break;
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
-void blk_mq_free_queue(struct request_queue *q)
+/* tags can _not_ be used after returning from blk_mq_exit_queue */
+void blk_mq_exit_queue(struct request_queue *q)
{
struct blk_mq_tag_set *set = q->tag_set;
return 0;
blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
if (!ret)
q->nr_requests = nr;
+ blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
return ret;