/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"
18 | void blk_mq_sched_free_hctx_data(struct request_queue *q, | |
19 | void (*exit)(struct blk_mq_hw_ctx *)) | |
20 | { | |
21 | struct blk_mq_hw_ctx *hctx; | |
22 | int i; | |
23 | ||
24 | queue_for_each_hw_ctx(q, hctx, i) { | |
25 | if (exit && hctx->sched_data) | |
26 | exit(hctx); | |
27 | kfree(hctx->sched_data); | |
28 | hctx->sched_data = NULL; | |
29 | } | |
30 | } | |
31 | EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); | |
32 | ||
33 | int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, | |
34 | int (*init)(struct blk_mq_hw_ctx *), | |
35 | void (*exit)(struct blk_mq_hw_ctx *)) | |
36 | { | |
37 | struct blk_mq_hw_ctx *hctx; | |
38 | int ret; | |
39 | int i; | |
40 | ||
41 | queue_for_each_hw_ctx(q, hctx, i) { | |
42 | hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node); | |
43 | if (!hctx->sched_data) { | |
44 | ret = -ENOMEM; | |
45 | goto error; | |
46 | } | |
47 | ||
48 | if (init) { | |
49 | ret = init(hctx); | |
50 | if (ret) { | |
51 | /* | |
52 | * We don't want to give exit() a partially | |
53 | * initialized sched_data. init() must clean up | |
54 | * if it fails. | |
55 | */ | |
56 | kfree(hctx->sched_data); | |
57 | hctx->sched_data = NULL; | |
58 | goto error; | |
59 | } | |
60 | } | |
61 | } | |
62 | ||
63 | return 0; | |
64 | error: | |
65 | blk_mq_sched_free_hctx_data(q, exit); | |
66 | return ret; | |
67 | } | |
68 | EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data); | |
69 | ||
70 | static void __blk_mq_sched_assign_ioc(struct request_queue *q, | |
f1ba8261 PV |
71 | struct request *rq, |
72 | struct bio *bio, | |
73 | struct io_context *ioc) | |
bd166ef1 JA |
74 | { |
75 | struct io_cq *icq; | |
76 | ||
77 | spin_lock_irq(q->queue_lock); | |
78 | icq = ioc_lookup_icq(ioc, q); | |
79 | spin_unlock_irq(q->queue_lock); | |
80 | ||
81 | if (!icq) { | |
82 | icq = ioc_create_icq(ioc, q, GFP_ATOMIC); | |
83 | if (!icq) | |
84 | return; | |
85 | } | |
86 | ||
87 | rq->elv.icq = icq; | |
f1ba8261 | 88 | if (!blk_mq_sched_get_rq_priv(q, rq, bio)) { |
bd166ef1 JA |
89 | rq->rq_flags |= RQF_ELVPRIV; |
90 | get_io_context(icq->ioc); | |
91 | return; | |
92 | } | |
93 | ||
94 | rq->elv.icq = NULL; | |
95 | } | |
96 | ||
/*
 * Thin wrapper: only assign an io_context to @rq when the bio's task
 * actually has one (rq_ioc() may return NULL).
 */
static void blk_mq_sched_assign_ioc(struct request_queue *q,
				    struct request *rq, struct bio *bio)
{
	struct io_context *ioc = rq_ioc(bio);

	if (ioc)
		__blk_mq_sched_assign_ioc(q, rq, bio, ioc);
}
106 | ||
107 | struct request *blk_mq_sched_get_request(struct request_queue *q, | |
108 | struct bio *bio, | |
109 | unsigned int op, | |
110 | struct blk_mq_alloc_data *data) | |
111 | { | |
112 | struct elevator_queue *e = q->elevator; | |
113 | struct blk_mq_hw_ctx *hctx; | |
114 | struct blk_mq_ctx *ctx; | |
115 | struct request *rq; | |
bd166ef1 JA |
116 | |
117 | blk_queue_enter_live(q); | |
118 | ctx = blk_mq_get_ctx(q); | |
119 | hctx = blk_mq_map_queue(q, ctx->cpu); | |
120 | ||
5a797e00 | 121 | blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx); |
bd166ef1 JA |
122 | |
123 | if (e) { | |
124 | data->flags |= BLK_MQ_REQ_INTERNAL; | |
125 | ||
126 | /* | |
127 | * Flush requests are special and go directly to the | |
128 | * dispatch list. | |
129 | */ | |
f73f44eb | 130 | if (!op_is_flush(op) && e->type->ops.mq.get_request) { |
bd166ef1 JA |
131 | rq = e->type->ops.mq.get_request(q, op, data); |
132 | if (rq) | |
133 | rq->rq_flags |= RQF_QUEUED; | |
134 | } else | |
135 | rq = __blk_mq_alloc_request(data, op); | |
136 | } else { | |
137 | rq = __blk_mq_alloc_request(data, op); | |
b48fda09 JA |
138 | if (rq) |
139 | data->hctx->tags->rqs[rq->tag] = rq; | |
bd166ef1 JA |
140 | } |
141 | ||
142 | if (rq) { | |
f73f44eb | 143 | if (!op_is_flush(op)) { |
bd166ef1 JA |
144 | rq->elv.icq = NULL; |
145 | if (e && e->type->icq_cache) | |
146 | blk_mq_sched_assign_ioc(q, rq, bio); | |
147 | } | |
148 | data->hctx->queued++; | |
149 | return rq; | |
150 | } | |
151 | ||
152 | blk_queue_exit(q); | |
153 | return NULL; | |
154 | } | |
155 | ||
156 | void blk_mq_sched_put_request(struct request *rq) | |
157 | { | |
158 | struct request_queue *q = rq->q; | |
159 | struct elevator_queue *e = q->elevator; | |
160 | ||
161 | if (rq->rq_flags & RQF_ELVPRIV) { | |
162 | blk_mq_sched_put_rq_priv(rq->q, rq); | |
163 | if (rq->elv.icq) { | |
164 | put_io_context(rq->elv.icq->ioc); | |
165 | rq->elv.icq = NULL; | |
166 | } | |
167 | } | |
168 | ||
169 | if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request) | |
170 | e->type->ops.mq.put_request(rq); | |
171 | else | |
172 | blk_mq_finish_request(rq); | |
173 | } | |
174 | ||
175 | void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) | |
176 | { | |
177 | struct elevator_queue *e = hctx->queue->elevator; | |
64765a75 JA |
178 | const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; |
179 | bool did_work = false; | |
bd166ef1 JA |
180 | LIST_HEAD(rq_list); |
181 | ||
182 | if (unlikely(blk_mq_hctx_stopped(hctx))) | |
183 | return; | |
184 | ||
185 | hctx->run++; | |
186 | ||
187 | /* | |
188 | * If we have previous entries on our dispatch list, grab them first for | |
189 | * more fair dispatch. | |
190 | */ | |
191 | if (!list_empty_careful(&hctx->dispatch)) { | |
192 | spin_lock(&hctx->lock); | |
193 | if (!list_empty(&hctx->dispatch)) | |
194 | list_splice_init(&hctx->dispatch, &rq_list); | |
195 | spin_unlock(&hctx->lock); | |
196 | } | |
197 | ||
198 | /* | |
199 | * Only ask the scheduler for requests, if we didn't have residual | |
200 | * requests from the dispatch list. This is to avoid the case where | |
201 | * we only ever dispatch a fraction of the requests available because | |
202 | * of low device queue depth. Once we pull requests out of the IO | |
203 | * scheduler, we can no longer merge or sort them. So it's best to | |
204 | * leave them there for as long as we can. Mark the hw queue as | |
205 | * needing a restart in that case. | |
206 | */ | |
c13660a0 | 207 | if (!list_empty(&rq_list)) { |
d38d3515 | 208 | blk_mq_sched_mark_restart_hctx(hctx); |
64765a75 JA |
209 | did_work = blk_mq_dispatch_rq_list(hctx, &rq_list); |
210 | } else if (!has_sched_dispatch) { | |
c13660a0 JA |
211 | blk_mq_flush_busy_ctxs(hctx, &rq_list); |
212 | blk_mq_dispatch_rq_list(hctx, &rq_list); | |
64765a75 JA |
213 | } |
214 | ||
215 | /* | |
216 | * We want to dispatch from the scheduler if we had no work left | |
217 | * on the dispatch list, OR if we did have work but weren't able | |
218 | * to make progress. | |
219 | */ | |
220 | if (!did_work && has_sched_dispatch) { | |
c13660a0 JA |
221 | do { |
222 | struct request *rq; | |
223 | ||
224 | rq = e->type->ops.mq.dispatch_request(hctx); | |
225 | if (!rq) | |
226 | break; | |
227 | list_add(&rq->queuelist, &rq_list); | |
228 | } while (blk_mq_dispatch_rq_list(hctx, &rq_list)); | |
229 | } | |
bd166ef1 JA |
230 | } |
231 | ||
232 | void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, | |
233 | struct list_head *rq_list, | |
234 | struct request *(*get_rq)(struct blk_mq_hw_ctx *)) | |
235 | { | |
236 | do { | |
237 | struct request *rq; | |
238 | ||
239 | rq = get_rq(hctx); | |
240 | if (!rq) | |
241 | break; | |
242 | ||
243 | list_add_tail(&rq->queuelist, rq_list); | |
244 | } while (1); | |
245 | } | |
246 | EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch); | |
247 | ||
e4d750c9 JA |
248 | bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, |
249 | struct request **merged_request) | |
bd166ef1 JA |
250 | { |
251 | struct request *rq; | |
bd166ef1 | 252 | |
34fe7c05 CH |
253 | switch (elv_merge(q, &rq, bio)) { |
254 | case ELEVATOR_BACK_MERGE: | |
bd166ef1 JA |
255 | if (!blk_mq_sched_allow_merge(q, rq, bio)) |
256 | return false; | |
34fe7c05 CH |
257 | if (!bio_attempt_back_merge(q, rq, bio)) |
258 | return false; | |
259 | *merged_request = attempt_back_merge(q, rq); | |
260 | if (!*merged_request) | |
261 | elv_merged_request(q, rq, ELEVATOR_BACK_MERGE); | |
262 | return true; | |
263 | case ELEVATOR_FRONT_MERGE: | |
bd166ef1 JA |
264 | if (!blk_mq_sched_allow_merge(q, rq, bio)) |
265 | return false; | |
34fe7c05 CH |
266 | if (!bio_attempt_front_merge(q, rq, bio)) |
267 | return false; | |
268 | *merged_request = attempt_front_merge(q, rq); | |
269 | if (!*merged_request) | |
270 | elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); | |
271 | return true; | |
272 | default: | |
273 | return false; | |
bd166ef1 | 274 | } |
bd166ef1 JA |
275 | } |
276 | EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); | |
277 | ||
278 | bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) | |
279 | { | |
280 | struct elevator_queue *e = q->elevator; | |
281 | ||
282 | if (e->type->ops.mq.bio_merge) { | |
283 | struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); | |
284 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); | |
285 | ||
286 | blk_mq_put_ctx(ctx); | |
287 | return e->type->ops.mq.bio_merge(hctx, bio); | |
288 | } | |
289 | ||
290 | return false; | |
291 | } | |
292 | ||
293 | bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) | |
294 | { | |
295 | return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); | |
296 | } | |
297 | EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); | |
298 | ||
299 | void blk_mq_sched_request_inserted(struct request *rq) | |
300 | { | |
301 | trace_block_rq_insert(rq->q, rq); | |
302 | } | |
303 | EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); | |
304 | ||
0cacba6c OS |
305 | static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, |
306 | struct request *rq) | |
bd166ef1 JA |
307 | { |
308 | if (rq->tag == -1) { | |
309 | rq->rq_flags |= RQF_SORTED; | |
310 | return false; | |
311 | } | |
312 | ||
313 | /* | |
314 | * If we already have a real request tag, send directly to | |
315 | * the dispatch list. | |
316 | */ | |
317 | spin_lock(&hctx->lock); | |
318 | list_add(&rq->queuelist, &hctx->dispatch); | |
319 | spin_unlock(&hctx->lock); | |
320 | return true; | |
321 | } | |
bd166ef1 | 322 | |
50e1dab8 JA |
323 | static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) |
324 | { | |
325 | if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { | |
326 | clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); | |
327 | if (blk_mq_hctx_has_pending(hctx)) | |
328 | blk_mq_run_hw_queue(hctx, true); | |
329 | } | |
330 | } | |
331 | ||
332 | void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) | |
333 | { | |
d38d3515 | 334 | struct request_queue *q = hctx->queue; |
50e1dab8 JA |
335 | unsigned int i; |
336 | ||
d38d3515 OS |
337 | if (test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { |
338 | if (test_and_clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { | |
339 | queue_for_each_hw_ctx(q, hctx, i) | |
340 | blk_mq_sched_restart_hctx(hctx); | |
341 | } | |
342 | } else { | |
50e1dab8 | 343 | blk_mq_sched_restart_hctx(hctx); |
50e1dab8 JA |
344 | } |
345 | } | |
346 | ||
bd6737f1 JA |
347 | /* |
348 | * Add flush/fua to the queue. If we fail getting a driver tag, then | |
349 | * punt to the requeue list. Requeue will re-invoke us from a context | |
350 | * that's safe to block from. | |
351 | */ | |
352 | static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx, | |
353 | struct request *rq, bool can_block) | |
354 | { | |
355 | if (blk_mq_get_driver_tag(rq, &hctx, can_block)) { | |
356 | blk_insert_flush(rq); | |
357 | blk_mq_run_hw_queue(hctx, true); | |
358 | } else | |
c7a571b4 | 359 | blk_mq_add_to_requeue_list(rq, false, true); |
bd6737f1 JA |
360 | } |
361 | ||
362 | void blk_mq_sched_insert_request(struct request *rq, bool at_head, | |
363 | bool run_queue, bool async, bool can_block) | |
364 | { | |
365 | struct request_queue *q = rq->q; | |
366 | struct elevator_queue *e = q->elevator; | |
367 | struct blk_mq_ctx *ctx = rq->mq_ctx; | |
368 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); | |
369 | ||
f3a8ab7d | 370 | if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) { |
bd6737f1 JA |
371 | blk_mq_sched_insert_flush(hctx, rq, can_block); |
372 | return; | |
373 | } | |
374 | ||
0cacba6c OS |
375 | if (e && blk_mq_sched_bypass_insert(hctx, rq)) |
376 | goto run; | |
377 | ||
bd6737f1 JA |
378 | if (e && e->type->ops.mq.insert_requests) { |
379 | LIST_HEAD(list); | |
380 | ||
381 | list_add(&rq->queuelist, &list); | |
382 | e->type->ops.mq.insert_requests(hctx, &list, at_head); | |
383 | } else { | |
384 | spin_lock(&ctx->lock); | |
385 | __blk_mq_insert_request(hctx, rq, at_head); | |
386 | spin_unlock(&ctx->lock); | |
387 | } | |
388 | ||
0cacba6c | 389 | run: |
bd6737f1 JA |
390 | if (run_queue) |
391 | blk_mq_run_hw_queue(hctx, async); | |
392 | } | |
393 | ||
394 | void blk_mq_sched_insert_requests(struct request_queue *q, | |
395 | struct blk_mq_ctx *ctx, | |
396 | struct list_head *list, bool run_queue_async) | |
397 | { | |
398 | struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); | |
399 | struct elevator_queue *e = hctx->queue->elevator; | |
400 | ||
0cacba6c OS |
401 | if (e) { |
402 | struct request *rq, *next; | |
403 | ||
404 | /* | |
405 | * We bypass requests that already have a driver tag assigned, | |
406 | * which should only be flushes. Flushes are only ever inserted | |
407 | * as single requests, so we shouldn't ever hit the | |
408 | * WARN_ON_ONCE() below (but let's handle it just in case). | |
409 | */ | |
410 | list_for_each_entry_safe(rq, next, list, queuelist) { | |
411 | if (WARN_ON_ONCE(rq->tag != -1)) { | |
412 | list_del_init(&rq->queuelist); | |
413 | blk_mq_sched_bypass_insert(hctx, rq); | |
414 | } | |
415 | } | |
416 | } | |
417 | ||
bd6737f1 JA |
418 | if (e && e->type->ops.mq.insert_requests) |
419 | e->type->ops.mq.insert_requests(hctx, list, false); | |
420 | else | |
421 | blk_mq_insert_requests(hctx, ctx, list); | |
422 | ||
423 | blk_mq_run_hw_queue(hctx, run_queue_async); | |
424 | } | |
425 | ||
bd166ef1 JA |
426 | static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, |
427 | struct blk_mq_hw_ctx *hctx, | |
428 | unsigned int hctx_idx) | |
429 | { | |
430 | if (hctx->sched_tags) { | |
431 | blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); | |
432 | blk_mq_free_rq_map(hctx->sched_tags); | |
433 | hctx->sched_tags = NULL; | |
434 | } | |
435 | } | |
436 | ||
437 | int blk_mq_sched_setup(struct request_queue *q) | |
438 | { | |
439 | struct blk_mq_tag_set *set = q->tag_set; | |
440 | struct blk_mq_hw_ctx *hctx; | |
441 | int ret, i; | |
442 | ||
443 | /* | |
444 | * Default to 256, since we don't split into sync/async like the | |
445 | * old code did. Additionally, this is a per-hw queue depth. | |
446 | */ | |
447 | q->nr_requests = 2 * BLKDEV_MAX_RQ; | |
448 | ||
449 | /* | |
450 | * We're switching to using an IO scheduler, so setup the hctx | |
451 | * scheduler tags and switch the request map from the regular | |
452 | * tags to scheduler tags. First allocate what we need, so we | |
453 | * can safely fail and fallback, if needed. | |
454 | */ | |
455 | ret = 0; | |
456 | queue_for_each_hw_ctx(q, hctx, i) { | |
415b806d SG |
457 | hctx->sched_tags = blk_mq_alloc_rq_map(set, i, |
458 | q->nr_requests, set->reserved_tags); | |
bd166ef1 JA |
459 | if (!hctx->sched_tags) { |
460 | ret = -ENOMEM; | |
461 | break; | |
462 | } | |
463 | ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests); | |
464 | if (ret) | |
465 | break; | |
466 | } | |
467 | ||
468 | /* | |
469 | * If we failed, free what we did allocate | |
470 | */ | |
471 | if (ret) { | |
472 | queue_for_each_hw_ctx(q, hctx, i) { | |
473 | if (!hctx->sched_tags) | |
474 | continue; | |
475 | blk_mq_sched_free_tags(set, hctx, i); | |
476 | } | |
477 | ||
478 | return ret; | |
479 | } | |
480 | ||
481 | return 0; | |
482 | } | |
483 | ||
484 | void blk_mq_sched_teardown(struct request_queue *q) | |
485 | { | |
486 | struct blk_mq_tag_set *set = q->tag_set; | |
487 | struct blk_mq_hw_ctx *hctx; | |
488 | int i; | |
489 | ||
490 | queue_for_each_hw_ctx(q, hctx, i) | |
491 | blk_mq_sched_free_tags(set, hctx, i); | |
492 | } | |
d3484991 JA |
493 | |
494 | int blk_mq_sched_init(struct request_queue *q) | |
495 | { | |
496 | int ret; | |
497 | ||
d3484991 JA |
498 | mutex_lock(&q->sysfs_lock); |
499 | ret = elevator_init(q, NULL); | |
500 | mutex_unlock(&q->sysfs_lock); | |
501 | ||
502 | return ret; | |
503 | } |