/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

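/*
 * Free the per-hctx scheduler data for every hardware queue in @q,
 * calling the elevator's @exit hook first if one is provided and the
 * hctx actually has sched_data attached.
 */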
void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (exit && hctx->sched_data)
			exit(hctx);
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);

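/*
 * Allocate @size bytes of per-hctx scheduler data for every hardware
 * queue and run the optional @init hook on it. On any failure, whatever
 * was set up so far is torn down again via blk_mq_sched_free_hctx_data()
 * and an errno is returned.
 */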
int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
				int (*init)(struct blk_mq_hw_ctx *),
				void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int ret;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
		if (!hctx->sched_data) {
			ret = -ENOMEM;
			goto error;
		}

		if (init) {
			ret = init(hctx);
			if (ret) {
				/*
				 * We don't want to give exit() a partially
				 * initialized sched_data. init() must clean up
				 * if it fails.
				 */
				kfree(hctx->sched_data);
				hctx->sched_data = NULL;
				goto error;
			}
		}
	}

	return 0;
error:
	blk_mq_sched_free_hctx_data(q, exit);
	return ret;
}
EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);

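/*
 * Look up (or create) the io_cq for @ioc on this queue and attach it to
 * the request. If the scheduler's rq_priv hook accepts the request, mark
 * it RQF_ELVPRIV and take a reference on the io_context; otherwise drop
 * the association again.
 */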
static void __blk_mq_sched_assign_ioc(struct request_queue *q,
				      struct request *rq, struct io_context *ioc)
{
	struct io_cq *icq;

	spin_lock_irq(q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}

	rq->elv.icq = icq;
	if (!blk_mq_sched_get_rq_priv(q, rq)) {
		rq->rq_flags |= RQF_ELVPRIV;
		get_io_context(icq->ioc);
		return;
	}

	rq->elv.icq = NULL;
}

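/* Assign an io_context to the request if the submitting bio has one. */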
static void blk_mq_sched_assign_ioc(struct request_queue *q,
				    struct request *rq, struct bio *bio)
{
	struct io_context *ioc;

	ioc = rq_ioc(bio);
	if (ioc)
		__blk_mq_sched_assign_ioc(q, rq, ioc);
}

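/*
 * Allocate a request for @bio. With an elevator attached, requests come
 * from the scheduler's internal tags; non-flush requests may be handed
 * out by the scheduler's own get_request hook, while flush requests
 * bypass it since they go directly to the dispatch list. Without an
 * elevator, the regular tag allocation path is used.
 */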
struct request *blk_mq_sched_get_request(struct request_queue *q,
					 struct bio *bio,
					 unsigned int op,
					 struct blk_mq_alloc_data *data)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	struct request *rq;
	const bool is_flush = op & (REQ_PREFLUSH | REQ_FUA);

	blk_queue_enter_live(q);
	ctx = blk_mq_get_ctx(q);
	hctx = blk_mq_map_queue(q, ctx->cpu);

	blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx);

	if (e) {
		data->flags |= BLK_MQ_REQ_INTERNAL;

		/*
		 * Flush requests are special and go directly to the
		 * dispatch list.
		 */
		if (!is_flush && e->type->ops.mq.get_request) {
			rq = e->type->ops.mq.get_request(q, op, data);
			if (rq)
				rq->rq_flags |= RQF_QUEUED;
		} else
			rq = __blk_mq_alloc_request(data, op);
	} else {
		rq = __blk_mq_alloc_request(data, op);
		if (rq)
			data->hctx->tags->rqs[rq->tag] = rq;
	}

	if (rq) {
		if (!is_flush) {
			rq->elv.icq = NULL;
			if (e && e->type->icq_cache)
				blk_mq_sched_assign_ioc(q, rq, bio);
		}
		data->hctx->queued++;
		return rq;
	}

	blk_queue_exit(q);
	return NULL;
}

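/*
 * Release a request obtained from blk_mq_sched_get_request(): drop the
 * scheduler's private data and io_context reference if any, then hand
 * the request back to the scheduler or to the regular free path.
 */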
void blk_mq_sched_put_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (rq->rq_flags & RQF_ELVPRIV) {
		blk_mq_sched_put_rq_priv(rq->q, rq);
		if (rq->elv.icq) {
			put_io_context(rq->elv.icq->ioc);
			rq->elv.icq = NULL;
		}
	}

	if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
		e->type->ops.mq.put_request(rq);
	else
		blk_mq_finish_request(rq);
}

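/*
 * Main dispatch path for a hardware queue: drain any leftover requests
 * from hctx->dispatch first, and only when that list is empty pull new
 * work from the IO scheduler (or the software queues if no scheduler is
 * attached), then hand the resulting list to blk_mq_dispatch_rq_list().
 */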
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct elevator_queue *e = hctx->queue->elevator;
	LIST_HEAD(rq_list);

	if (unlikely(blk_mq_hctx_stopped(hctx)))
		return;

	hctx->run++;

	/*
	 * If we have previous entries on our dispatch list, grab them first
	 * for more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 */
	if (list_empty(&rq_list)) {
		if (e && e->type->ops.mq.dispatch_requests)
			e->type->ops.mq.dispatch_requests(hctx, &rq_list);
		else
			blk_mq_flush_busy_ctxs(hctx, &rq_list);
	} else
		blk_mq_sched_mark_restart(hctx);

	blk_mq_dispatch_rq_list(hctx, &rq_list);
}

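/*
 * Helper for schedulers: repeatedly call @get_rq and move every request
 * it returns onto @rq_list until it returns NULL. A scheduler's
 * dispatch_requests hook might use it to drain its internal queue, for
 * example (hypothetical usage sketch, not taken from this file):
 *
 *	static void foo_dispatch_requests(struct blk_mq_hw_ctx *hctx,
 *					  struct list_head *list)
 *	{
 *		blk_mq_sched_move_to_dispatch(hctx, list, foo_next_request);
 *	}
 */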
void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
				   struct list_head *rq_list,
				   struct request *(*get_rq)(struct blk_mq_hw_ctx *))
{
	do {
		struct request *rq;

		rq = get_rq(hctx);
		if (!rq)
			break;

		list_add_tail(&rq->queuelist, rq_list);
	} while (1);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);

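/*
 * Ask the elevator whether @bio can be merged into an existing request,
 * and if so attempt the back or front merge. Returns true if the bio
 * was merged and the caller does not need to allocate a new request.
 */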
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio)
{
	struct request *rq;
	int ret;

	ret = elv_merge(q, &rq, bio);
	if (ret == ELEVATOR_BACK_MERGE) {
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (bio_attempt_back_merge(q, rq, bio)) {
			if (!attempt_back_merge(q, rq))
				elv_merged_request(q, rq, ret);
			return true;
		}
	} else if (ret == ELEVATOR_FRONT_MERGE) {
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (bio_attempt_front_merge(q, rq, bio)) {
			if (!attempt_front_merge(q, rq))
				elv_merged_request(q, rq, ret);
			return true;
		}
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

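/*
 * Called with an elevator attached: hand the bio to the scheduler's
 * bio_merge hook, if it has one, so the scheduler can try to merge it
 * into one of the requests it is holding.
 */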
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.mq.bio_merge) {
		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
		struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

		blk_mq_put_ctx(ctx);
		return e->type->ops.mq.bio_merge(hctx, bio);
	}

	return false;
}

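/*
 * Try to merge @rq into a request already queued in the elevator before
 * inserting it; returns true if the merge succeeded.
 */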
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

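/*
 * Decide whether a request should bypass the IO scheduler on insert. A
 * request that already owns a driver tag goes straight to the hctx
 * dispatch list (return true); an untagged request is marked RQF_SORTED
 * and left for the scheduler to queue (return false).
 */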
bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	if (rq->tag == -1) {
		rq->rq_flags |= RQF_SORTED;
		return false;
	}

	/*
	 * If we already have a real request tag, send directly to
	 * the dispatch list.
	 */
	spin_lock(&hctx->lock);
	list_add(&rq->queuelist, &hctx->dispatch);
	spin_unlock(&hctx->lock);
	return true;
}
EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert);

static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	if (hctx->sched_tags) {
		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
		blk_mq_free_rq_map(hctx->sched_tags);
		hctx->sched_tags = NULL;
	}
}

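/*
 * Prepare the queue for running with an IO scheduler: pick a per-hctx
 * queue depth and allocate a separate scheduler tag map plus requests
 * for every hardware queue. On failure, everything allocated so far is
 * freed and the error is returned, so the caller can fall back to
 * running without a scheduler.
 */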
int blk_mq_sched_setup(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int ret, i;

	/*
	 * Default to 256, since we don't split into sync/async like the
	 * old code did. Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * BLKDEV_MAX_RQ;

	/*
	 * We're switching to using an IO scheduler, so set up the hctx
	 * scheduler tags and switch the request map from the regular
	 * tags to scheduler tags. First allocate what we need, so we
	 * can safely fail and fall back, if needed.
	 */
	ret = 0;
	queue_for_each_hw_ctx(q, hctx, i) {
		hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0);
		if (!hctx->sched_tags) {
			ret = -ENOMEM;
			break;
		}
		ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
		if (ret)
			break;
	}

	/*
	 * If we failed, free what we did allocate.
	 */
	if (ret) {
		queue_for_each_hw_ctx(q, hctx, i) {
			if (!hctx->sched_tags)
				continue;
			blk_mq_sched_free_tags(set, hctx, i);
		}

		return ret;
	}

	return 0;
}

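/*
 * Undo blk_mq_sched_setup(): free the scheduler tags and requests for
 * every hardware queue.
 */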
void blk_mq_sched_teardown(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
}

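/*
 * Pick and initialize the default IO scheduler for a new blk-mq queue,
 * honoring the CONFIG_DEFAULT_SQ_NONE / CONFIG_DEFAULT_MQ_NONE options
 * that disable a default scheduler for single- and multi-queue devices
 * respectively.
 */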
int blk_mq_sched_init(struct request_queue *q)
{
	int ret;

#if defined(CONFIG_DEFAULT_SQ_NONE)
	if (q->nr_hw_queues == 1)
		return 0;
#endif
#if defined(CONFIG_DEFAULT_MQ_NONE)
	if (q->nr_hw_queues > 1)
		return 0;
#endif

	mutex_lock(&q->sysfs_lock);
	ret = elevator_init(q, NULL);
	mutex_unlock(&q->sysfs_lock);

	return ret;
}