[mirror_ubuntu-kernels.git] / block / blk-core.c
3dcf60bc 1// SPDX-License-Identifier: GPL-2.0
1da177e4 2/*
1da177e4
LT
3 * Copyright (C) 1991, 1992 Linus Torvalds
4 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
5 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
6 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
6728cb0e
JA
7 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
8 * - July 2000
1da177e4
LT
9 * bio rewrite, highmem i/o, etc., Jens Axboe <axboe@suse.de> - May 2001
10 */
11
12/*
13 * This handles all read/write requests to block devices
14 */
1da177e4
LT
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/backing-dev.h>
18#include <linux/bio.h>
19#include <linux/blkdev.h>
320ae51f 20#include <linux/blk-mq.h>
52abca64 21#include <linux/blk-pm.h>
1da177e4
LT
22#include <linux/highmem.h>
23#include <linux/mm.h>
cee9a0c4 24#include <linux/pagemap.h>
1da177e4
LT
25#include <linux/kernel_stat.h>
26#include <linux/string.h>
27#include <linux/init.h>
1da177e4
LT
28#include <linux/completion.h>
29#include <linux/slab.h>
30#include <linux/swap.h>
31#include <linux/writeback.h>
faccbd4b 32#include <linux/task_io_accounting_ops.h>
c17bb495 33#include <linux/fault-inject.h>
73c10101 34#include <linux/list_sort.h>
e3c78ca5 35#include <linux/delay.h>
aaf7c680 36#include <linux/ratelimit.h>
6c954667 37#include <linux/pm_runtime.h>
eea8f41c 38#include <linux/blk-cgroup.h>
54d4e6ab 39#include <linux/t10-pi.h>
18fbda91 40#include <linux/debugfs.h>
30abb3a6 41#include <linux/bpf.h>
b8e24a93 42#include <linux/psi.h>
71ac860a 43#include <linux/sched/sysctl.h>
a892c8d5 44#include <linux/blk-crypto.h>
55782138
LZ
45
46#define CREATE_TRACE_POINTS
47#include <trace/events/block.h>
1da177e4 48
8324aa91 49#include "blk.h"
43a5e4e2 50#include "blk-mq.h"
bd166ef1 51#include "blk-mq-sched.h"
bca6b067 52#include "blk-pm.h"
c1c80384 53#include "blk-rq-qos.h"
8324aa91 54
18fbda91 55struct dentry *blk_debugfs_root;
18fbda91 56
d07335e5 57EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
b0da3f0d 58EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
0a82a8d1 59EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
3291fa57 60EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
cbae8d45 61EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
b357e4a6 62EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
0bfc2455 63
a73f730d
TH
64DEFINE_IDA(blk_queue_ida);
65
1da177e4
LT
66/*
67 * For queue allocation
68 */
6728cb0e 69struct kmem_cache *blk_requestq_cachep;
1da177e4 70
1da177e4
LT
71/*
72 * Controlling structure to kblockd
73 */
ff856bad 74static struct workqueue_struct *kblockd_workqueue;
1da177e4 75
8814ce8a
BVA
76/**
77 * blk_queue_flag_set - atomically set a queue flag
78 * @flag: flag to be set
79 * @q: request queue
80 */
81void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
82{
57d74df9 83 set_bit(flag, &q->queue_flags);
8814ce8a
BVA
84}
85EXPORT_SYMBOL(blk_queue_flag_set);
86
87/**
88 * blk_queue_flag_clear - atomically clear a queue flag
89 * @flag: flag to be cleared
90 * @q: request queue
91 */
92void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
93{
57d74df9 94 clear_bit(flag, &q->queue_flags);
8814ce8a
BVA
95}
96EXPORT_SYMBOL(blk_queue_flag_clear);
97
98/**
99 * blk_queue_flag_test_and_set - atomically test and set a queue flag
100 * @flag: flag to be set
101 * @q: request queue
102 *
103 * Returns the previous value of @flag - 0 if the flag was not set and 1 if
104 * the flag was already set.
105 */
106bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
107{
57d74df9 108 return test_and_set_bit(flag, &q->queue_flags);
8814ce8a
BVA
109}
110EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
111
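/*
 * Illustrative sketch (not part of this file): a block driver would
 * normally use these helpers instead of touching q->queue_flags
 * directly, e.g. to mark its queue as non-rotational:
 *
 *	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 *	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
 *	if (blk_queue_flag_test_and_set(QUEUE_FLAG_NOMERGES, q))
 *		pr_debug("NOMERGES was already set\n");
 */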
2a4aa30c 112void blk_rq_init(struct request_queue *q, struct request *rq)
1da177e4 113{
1afb20f3
FT
114 memset(rq, 0, sizeof(*rq));
115
1da177e4 116 INIT_LIST_HEAD(&rq->queuelist);
63a71386 117 rq->q = q;
a2dec7b3 118 rq->__sector = (sector_t) -1;
2e662b65
JA
119 INIT_HLIST_NODE(&rq->hash);
120 RB_CLEAR_NODE(&rq->rb_node);
e44a6a23
XT
121 rq->tag = BLK_MQ_NO_TAG;
122 rq->internal_tag = BLK_MQ_NO_TAG;
522a7775 123 rq->start_time_ns = ktime_get_ns();
09e099d4 124 rq->part = NULL;
b554db14 125 refcount_set(&rq->ref, 1);
a892c8d5 126 blk_crypto_rq_set_defaults(rq);
1da177e4 127}
2a4aa30c 128EXPORT_SYMBOL(blk_rq_init);
1da177e4 129
e47bc4ed
CK
130#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
131static const char *const blk_op_name[] = {
132 REQ_OP_NAME(READ),
133 REQ_OP_NAME(WRITE),
134 REQ_OP_NAME(FLUSH),
135 REQ_OP_NAME(DISCARD),
136 REQ_OP_NAME(SECURE_ERASE),
137 REQ_OP_NAME(ZONE_RESET),
6e33dbf2 138 REQ_OP_NAME(ZONE_RESET_ALL),
6c1b1da5
AJ
139 REQ_OP_NAME(ZONE_OPEN),
140 REQ_OP_NAME(ZONE_CLOSE),
141 REQ_OP_NAME(ZONE_FINISH),
0512a75b 142 REQ_OP_NAME(ZONE_APPEND),
e47bc4ed
CK
143 REQ_OP_NAME(WRITE_SAME),
144 REQ_OP_NAME(WRITE_ZEROES),
145 REQ_OP_NAME(SCSI_IN),
146 REQ_OP_NAME(SCSI_OUT),
147 REQ_OP_NAME(DRV_IN),
148 REQ_OP_NAME(DRV_OUT),
149};
150#undef REQ_OP_NAME
151
152/**
 153 * blk_op_str - Return the string XXX for REQ_OP_XXX.
154 * @op: REQ_OP_XXX.
155 *
 156 * Description: Centralized block layer function to convert REQ_OP_XXX into
 157 * string format. Useful for debugging and tracing bios or requests. For an
 158 * invalid REQ_OP_XXX it returns the string "UNKNOWN".
159 */
160inline const char *blk_op_str(unsigned int op)
161{
162 const char *op_str = "UNKNOWN";
163
164 if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
165 op_str = blk_op_name[op];
166
167 return op_str;
168}
169EXPORT_SYMBOL_GPL(blk_op_str);
170
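/*
 * Illustrative sketch (not part of this file): blk_op_str() is intended
 * for debug and trace output, e.g.:
 *
 *	pr_debug("%s: op %u (%s)\n", __func__, req_op(rq),
 *		 blk_op_str(req_op(rq)));
 */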
2a842aca
CH
171static const struct {
172 int errno;
173 const char *name;
174} blk_errors[] = {
175 [BLK_STS_OK] = { 0, "" },
176 [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" },
177 [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" },
178 [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
179 [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
180 [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
181 [BLK_STS_NEXUS] = { -EBADE, "critical nexus" },
182 [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
183 [BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
184 [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
86ff7c2a 185 [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
03a07c92 186 [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
2a842aca 187
4e4cbee9
CH
188 /* device mapper special case, should not leak out: */
189 [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
190
3b481d91
KB
191 /* zone device specific errors */
192 [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" },
193 [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" },
194
2a842aca
CH
195 /* everything else not covered above: */
196 [BLK_STS_IOERR] = { -EIO, "I/O" },
197};
198
199blk_status_t errno_to_blk_status(int errno)
200{
201 int i;
202
203 for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
204 if (blk_errors[i].errno == errno)
205 return (__force blk_status_t)i;
206 }
207
208 return BLK_STS_IOERR;
209}
210EXPORT_SYMBOL_GPL(errno_to_blk_status);
211
212int blk_status_to_errno(blk_status_t status)
213{
214 int idx = (__force int)status;
215
34bd9c1c 216 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
2a842aca
CH
217 return -EIO;
218 return blk_errors[idx].errno;
219}
220EXPORT_SYMBOL_GPL(blk_status_to_errno);
221
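/*
 * Illustrative sketch (not part of this file): drivers translate at the
 * boundary between -errno based helpers and the block layer's
 * blk_status_t, e.g. (do_driver_io() is hypothetical):
 *
 *	int err = do_driver_io(rq);
 *	blk_mq_end_request(rq, errno_to_blk_status(err));
 *
 * and the reverse direction when an errno is needed:
 *
 *	return blk_status_to_errno(status);
 */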
178cc590
CH
222static void print_req_error(struct request *req, blk_status_t status,
223 const char *caller)
2a842aca
CH
224{
225 int idx = (__force int)status;
226
34bd9c1c 227 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
2a842aca
CH
228 return;
229
178cc590 230 printk_ratelimited(KERN_ERR
b0e5168a
CK
231 "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
232 "phys_seg %u prio class %u\n",
178cc590 233 caller, blk_errors[idx].name,
b0e5168a
CK
234 req->rq_disk ? req->rq_disk->disk_name : "?",
235 blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
236 req->cmd_flags & ~REQ_OP_MASK,
237 req->nr_phys_segments,
238 IOPRIO_PRIO_CLASS(req->ioprio));
2a842aca
CH
239}
240
5bb23a68 241static void req_bio_endio(struct request *rq, struct bio *bio,
2a842aca 242 unsigned int nbytes, blk_status_t error)
1da177e4 243{
78d8e58a 244 if (error)
4e4cbee9 245 bio->bi_status = error;
797e7dbb 246
e8064021 247 if (unlikely(rq->rq_flags & RQF_QUIET))
b7c44ed9 248 bio_set_flag(bio, BIO_QUIET);
08bafc03 249
f79ea416 250 bio_advance(bio, nbytes);
7ba1ba12 251
0512a75b
KB
252 if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
253 /*
254 * Partial zone append completions cannot be supported as the
255 * BIO fragments may end up not being written sequentially.
256 */
257 if (bio->bi_iter.bi_size)
258 bio->bi_status = BLK_STS_IOERR;
259 else
260 bio->bi_iter.bi_sector = rq->__sector;
261 }
262
143a87f4 263 /* don't actually finish bio if it's part of flush sequence */
e8064021 264 if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
4246a0b6 265 bio_endio(bio);
1da177e4 266}
1da177e4 267
1da177e4
LT
268void blk_dump_rq_flags(struct request *rq, char *msg)
269{
aebf526b
CH
270 printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
271 rq->rq_disk ? rq->rq_disk->disk_name : "?",
5953316d 272 (unsigned long long) rq->cmd_flags);
1da177e4 273
83096ebf
TH
274 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
275 (unsigned long long)blk_rq_pos(rq),
276 blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
b4f42e28
JA
277 printk(KERN_INFO " bio %p, biotail %p, len %u\n",
278 rq->bio, rq->biotail, blk_rq_bytes(rq));
1da177e4 279}
1da177e4
LT
280EXPORT_SYMBOL(blk_dump_rq_flags);
281
1da177e4
LT
282/**
283 * blk_sync_queue - cancel any pending callbacks on a queue
284 * @q: the queue
285 *
286 * Description:
287 * The block layer may perform asynchronous callback activity
288 * on a queue, such as calling the unplug function after a timeout.
289 * A block device may call blk_sync_queue to ensure that any
290 * such activity is cancelled, thus allowing it to release resources
59c51591 291 * that the callbacks might use. The caller must already have made sure
c62b37d9 292 * that its ->submit_bio will not re-add plugging prior to calling
1da177e4
LT
293 * this function.
294 *
da527770 295 * This function does not cancel any asynchronous activity arising
da3dae54 296 * out of elevator or throttling code. That would require elevator_exit()
5efd6113 297 * and blkcg_exit_queue() to be called with queue lock initialized.
da527770 298 *
1da177e4
LT
299 */
300void blk_sync_queue(struct request_queue *q)
301{
70ed28b9 302 del_timer_sync(&q->timeout);
4e9b6f20 303 cancel_work_sync(&q->timeout_work);
1da177e4
LT
304}
305EXPORT_SYMBOL(blk_sync_queue);
306
c9254f2d 307/**
cd84a62e 308 * blk_set_pm_only - increment pm_only counter
c9254f2d 309 * @q: request queue pointer
c9254f2d 310 */
cd84a62e 311void blk_set_pm_only(struct request_queue *q)
c9254f2d 312{
cd84a62e 313 atomic_inc(&q->pm_only);
c9254f2d 314}
cd84a62e 315EXPORT_SYMBOL_GPL(blk_set_pm_only);
c9254f2d 316
cd84a62e 317void blk_clear_pm_only(struct request_queue *q)
c9254f2d 318{
cd84a62e
BVA
319 int pm_only;
320
321 pm_only = atomic_dec_return(&q->pm_only);
322 WARN_ON_ONCE(pm_only < 0);
323 if (pm_only == 0)
324 wake_up_all(&q->mq_freeze_wq);
c9254f2d 325}
cd84a62e 326EXPORT_SYMBOL_GPL(blk_clear_pm_only);
c9254f2d 327
b5bd357c
LC
328/**
329 * blk_put_queue - decrement the request_queue refcount
330 * @q: the request_queue structure to decrement the refcount for
331 *
332 * Decrements the refcount of the request_queue kobject. When this reaches 0
333 * we'll have blk_release_queue() called.
e8c7d14a
LC
334 *
335 * Context: Any context, but the last reference must not be dropped from
336 * atomic context.
b5bd357c 337 */
165125e1 338void blk_put_queue(struct request_queue *q)
483f4afc
AV
339{
340 kobject_put(&q->kobj);
341}
d86e0e83 342EXPORT_SYMBOL(blk_put_queue);
483f4afc 343
aed3ea94
JA
344void blk_set_queue_dying(struct request_queue *q)
345{
8814ce8a 346 blk_queue_flag_set(QUEUE_FLAG_DYING, q);
aed3ea94 347
d3cfb2a0
ML
348 /*
 349 * When the queue DYING flag is set, we need to block new requests
 350 * from entering the queue, so we call blk_freeze_queue_start() to
 351 * prevent I/O from crossing blk_queue_enter().
352 */
353 blk_freeze_queue_start(q);
354
344e9ffc 355 if (queue_is_mq(q))
aed3ea94 356 blk_mq_wake_waiters(q);
055f6e18
ML
357
358 /* Make blk_queue_enter() reexamine the DYING flag. */
359 wake_up_all(&q->mq_freeze_wq);
aed3ea94
JA
360}
361EXPORT_SYMBOL_GPL(blk_set_queue_dying);
362
c9a929dd
TH
363/**
364 * blk_cleanup_queue - shutdown a request queue
365 * @q: request queue to shutdown
366 *
c246e80d
BVA
367 * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
368 * put it. All future requests will be failed immediately with -ENODEV.
e8c7d14a
LC
369 *
370 * Context: can sleep
c94a96ac 371 */
6728cb0e 372void blk_cleanup_queue(struct request_queue *q)
483f4afc 373{
e8c7d14a
LC
374 /* cannot be called from atomic context */
375 might_sleep();
376
bae85c15
BVA
377 WARN_ON_ONCE(blk_queue_registered(q));
378
3f3299d5 379 /* mark @q DYING, no new request or merges will be allowed afterwards */
aed3ea94 380 blk_set_queue_dying(q);
6ecf23af 381
57d74df9
CH
382 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
383 blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
c9a929dd 384
c246e80d
BVA
385 /*
386 * Drain all requests queued before DYING marking. Set DEAD flag to
67ed8b73
BVA
 387 * prevent blk_mq_run_hw_queues() from accessing the hardware queues
 388 * after draining has finished.
c246e80d 389 */
3ef28e83 390 blk_freeze_queue(q);
c57cdf7a
ML
391
392 rq_qos_exit(q);
393
57d74df9 394 blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
c9a929dd 395
5a48fc14
DW
396 /* for synchronous bio-based driver finish in-flight integrity i/o */
397 blk_flush_integrity();
398
c9a929dd 399 /* @q won't process any more request, flush async actions */
dc3b17cc 400 del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
c9a929dd
TH
401 blk_sync_queue(q);
402
344e9ffc 403 if (queue_is_mq(q))
c7e2d94b 404 blk_mq_exit_queue(q);
a1ce35fa 405
c3e22192
ML
406 /*
407 * In theory, request pool of sched_tags belongs to request queue.
408 * However, the current implementation requires tag_set for freeing
409 * requests, so free the pool now.
410 *
 411 * The queue has become frozen, so there can't be any in-queue requests;
 412 * it is safe to free them now.
413 */
414 mutex_lock(&q->sysfs_lock);
415 if (q->elevator)
416 blk_mq_sched_free_requests(q);
417 mutex_unlock(&q->sysfs_lock);
418
3ef28e83 419 percpu_ref_exit(&q->q_usage_counter);
45a9c9d9 420
c9a929dd 421 /* @q is and will stay empty, shutdown and put */
483f4afc
AV
422 blk_put_queue(q);
423}
1da177e4
LT
424EXPORT_SYMBOL(blk_cleanup_queue);
425
3a0a5299
BVA
426/**
427 * blk_queue_enter() - try to increase q->q_usage_counter
428 * @q: request queue pointer
a4d34da7 429 * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
3a0a5299 430 */
9a95e4ef 431int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
3ef28e83 432{
a4d34da7 433 const bool pm = flags & BLK_MQ_REQ_PM;
3a0a5299 434
3ef28e83 435 while (true) {
3a0a5299 436 bool success = false;
3ef28e83 437
818e0fa2 438 rcu_read_lock();
3a0a5299
BVA
439 if (percpu_ref_tryget_live(&q->q_usage_counter)) {
440 /*
cd84a62e
BVA
441 * The code that increments the pm_only counter is
442 * responsible for ensuring that that counter is
443 * globally visible before the queue is unfrozen.
3a0a5299 444 */
52abca64
AS
445 if ((pm && queue_rpm_status(q) != RPM_SUSPENDED) ||
446 !blk_queue_pm_only(q)) {
3a0a5299
BVA
447 success = true;
448 } else {
449 percpu_ref_put(&q->q_usage_counter);
450 }
451 }
818e0fa2 452 rcu_read_unlock();
3a0a5299
BVA
453
454 if (success)
3ef28e83
DW
455 return 0;
456
3a0a5299 457 if (flags & BLK_MQ_REQ_NOWAIT)
3ef28e83
DW
458 return -EBUSY;
459
5ed61d3f 460 /*
1671d522 461 * This is the read pair of the barrier in blk_freeze_queue_start():
5ed61d3f 462 * we need to order reading the __PERCPU_REF_DEAD flag of
d3cfb2a0
ML
463 * .q_usage_counter and reading .mq_freeze_depth or
464 * queue dying flag, otherwise the following wait may
465 * never return if the two reads are reordered.
5ed61d3f
ML
466 */
467 smp_rmb();
468
1dc3039b 469 wait_event(q->mq_freeze_wq,
7996a8b5 470 (!q->mq_freeze_depth &&
52abca64 471 blk_pm_resume_queue(pm, q)) ||
1dc3039b 472 blk_queue_dying(q));
3ef28e83
DW
473 if (blk_queue_dying(q))
474 return -ENODEV;
3ef28e83
DW
475 }
476}
477
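/*
 * Illustrative sketch (not part of this file): callers that need the
 * queue to stay usable across an operation that is not itself a request
 * bracket it with blk_queue_enter()/blk_queue_exit()
 * (do_something_with_queue() is hypothetical):
 *
 *	if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT))
 *		return -EBUSY;
 *	ret = do_something_with_queue(q);
 *	blk_queue_exit(q);
 */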
accea322
CH
478static inline int bio_queue_enter(struct bio *bio)
479{
309dca30 480 struct request_queue *q = bio->bi_bdev->bd_disk->queue;
accea322
CH
481 bool nowait = bio->bi_opf & REQ_NOWAIT;
482 int ret;
483
484 ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);
485 if (unlikely(ret)) {
486 if (nowait && !blk_queue_dying(q))
487 bio_wouldblock_error(bio);
488 else
489 bio_io_error(bio);
490 }
491
492 return ret;
493}
494
3ef28e83
DW
495void blk_queue_exit(struct request_queue *q)
496{
497 percpu_ref_put(&q->q_usage_counter);
498}
499
500static void blk_queue_usage_counter_release(struct percpu_ref *ref)
501{
502 struct request_queue *q =
503 container_of(ref, struct request_queue, q_usage_counter);
504
505 wake_up_all(&q->mq_freeze_wq);
506}
507
bca237a5 508static void blk_rq_timed_out_timer(struct timer_list *t)
287922eb 509{
bca237a5 510 struct request_queue *q = from_timer(q, t, timeout);
287922eb
CH
511
512 kblockd_schedule_work(&q->timeout_work);
513}
514
2e3c18d0
TH
515static void blk_timeout_work(struct work_struct *work)
516{
517}
518
c62b37d9 519struct request_queue *blk_alloc_queue(int node_id)
1946089a 520{
165125e1 521 struct request_queue *q;
338aa96d 522 int ret;
1946089a 523
8324aa91 524 q = kmem_cache_alloc_node(blk_requestq_cachep,
3d745ea5 525 GFP_KERNEL | __GFP_ZERO, node_id);
1da177e4
LT
526 if (!q)
527 return NULL;
528
cbf62af3 529 q->last_merge = NULL;
cbf62af3 530
3d745ea5 531 q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
a73f730d 532 if (q->id < 0)
3d2936f4 533 goto fail_q;
a73f730d 534
c495a176 535 ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
338aa96d 536 if (ret)
54efd50b
KO
537 goto fail_id;
538
aef33c2f 539 q->backing_dev_info = bdi_alloc(node_id);
d03f6cdc
JK
540 if (!q->backing_dev_info)
541 goto fail_split;
542
a83b576c
JA
543 q->stats = blk_alloc_queue_stats();
544 if (!q->stats)
545 goto fail_stats;
546
5151412d 547 q->node = node_id;
0989a025 548
bccf5e26
JG
549 atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
550
bca237a5
KC
551 timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
552 laptop_mode_timer_fn, 0);
553 timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
2e3c18d0 554 INIT_WORK(&q->timeout_work, blk_timeout_work);
a612fddf 555 INIT_LIST_HEAD(&q->icq_list);
4eef3049 556#ifdef CONFIG_BLK_CGROUP
e8989fae 557 INIT_LIST_HEAD(&q->blkg_list);
4eef3049 558#endif
483f4afc 559
8324aa91 560 kobject_init(&q->kobj, &blk_queue_ktype);
1da177e4 561
85e0cbbb 562 mutex_init(&q->debugfs_mutex);
483f4afc 563 mutex_init(&q->sysfs_lock);
cecf5d87 564 mutex_init(&q->sysfs_dir_lock);
0d945c1f 565 spin_lock_init(&q->queue_lock);
c94a96ac 566
320ae51f 567 init_waitqueue_head(&q->mq_freeze_wq);
7996a8b5 568 mutex_init(&q->mq_freeze_lock);
320ae51f 569
3ef28e83
DW
570 /*
571 * Init percpu_ref in atomic mode so that it's faster to shutdown.
572 * See blk_register_queue() for details.
573 */
574 if (percpu_ref_init(&q->q_usage_counter,
575 blk_queue_usage_counter_release,
576 PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
fff4996b 577 goto fail_bdi;
f51b802c 578
3ef28e83
DW
579 if (blkcg_init_queue(q))
580 goto fail_ref;
581
3d745ea5
CH
582 blk_queue_dma_alignment(q, 511);
583 blk_set_default_limits(&q->limits);
c62b37d9 584 q->nr_requests = BLKDEV_MAX_RQ;
3d745ea5 585
1da177e4 586 return q;
a73f730d 587
3ef28e83
DW
588fail_ref:
589 percpu_ref_exit(&q->q_usage_counter);
fff4996b 590fail_bdi:
a83b576c
JA
591 blk_free_queue_stats(q->stats);
592fail_stats:
d03f6cdc 593 bdi_put(q->backing_dev_info);
54efd50b 594fail_split:
338aa96d 595 bioset_exit(&q->bio_split);
a73f730d
TH
596fail_id:
597 ida_simple_remove(&blk_queue_ida, q->id);
598fail_q:
599 kmem_cache_free(blk_requestq_cachep, q);
600 return NULL;
1da177e4 601}
3d745ea5 602EXPORT_SYMBOL(blk_alloc_queue);
1da177e4 603
b5bd357c
LC
604/**
605 * blk_get_queue - increment the request_queue refcount
606 * @q: the request_queue structure to increment the refcount for
607 *
608 * Increment the refcount of the request_queue kobject.
763b5892
LC
609 *
610 * Context: Any context.
b5bd357c 611 */
09ac46c4 612bool blk_get_queue(struct request_queue *q)
1da177e4 613{
3f3299d5 614 if (likely(!blk_queue_dying(q))) {
09ac46c4
TH
615 __blk_get_queue(q);
616 return true;
1da177e4
LT
617 }
618
09ac46c4 619 return false;
1da177e4 620}
d86e0e83 621EXPORT_SYMBOL(blk_get_queue);
1da177e4 622
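/*
 * Illustrative sketch (not part of this file): code that caches a
 * request_queue pointer beyond the current call chain takes a reference
 * first and drops it with blk_put_queue() when done:
 *
 *	if (!blk_get_queue(q))
 *		return -ENODEV;		// queue is already dying
 *	...
 *	blk_put_queue(q);
 */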
a1ce35fa
JA
623/**
624 * blk_get_request - allocate a request
625 * @q: request queue to allocate a request for
626 * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
627 * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
1da177e4 628 */
a1ce35fa
JA
629struct request *blk_get_request(struct request_queue *q, unsigned int op,
630 blk_mq_req_flags_t flags)
1da177e4 631{
a1ce35fa 632 struct request *req;
1da177e4 633
a1ce35fa 634 WARN_ON_ONCE(op & REQ_NOWAIT);
a4d34da7 635 WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM));
1da177e4 636
a1ce35fa
JA
637 req = blk_mq_alloc_request(q, op, flags);
638 if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
639 q->mq_ops->initialize_rq_fn(req);
1da177e4 640
a1ce35fa 641 return req;
1da177e4 642}
a1ce35fa 643EXPORT_SYMBOL(blk_get_request);
1da177e4 644
1da177e4
LT
645void blk_put_request(struct request *req)
646{
a1ce35fa 647 blk_mq_free_request(req);
1da177e4 648}
1da177e4
LT
649EXPORT_SYMBOL(blk_put_request);
650
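/*
 * Illustrative sketch (not part of this file): a passthrough user
 * allocates a request, fills in the payload, typically executes it with
 * blk_execute_rq(), and frees it again, roughly:
 *
 *	struct request *rq;
 *
 *	rq = blk_get_request(q, REQ_OP_DRV_OUT, BLK_MQ_REQ_NOWAIT);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	// ... set up the passthrough payload, execute the request ...
 *	blk_put_request(rq);
 */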
52c5e62d 651static void handle_bad_sector(struct bio *bio, sector_t maxsector)
1da177e4
LT
652{
653 char b[BDEVNAME_SIZE];
654
f4ac712e
TH
655 pr_info_ratelimited("attempt to access beyond end of device\n"
656 "%s: rw=%d, want=%llu, limit=%llu\n",
657 bio_devname(bio, b), bio->bi_opf,
658 bio_end_sector(bio), maxsector);
1da177e4
LT
659}
660
c17bb495
AM
661#ifdef CONFIG_FAIL_MAKE_REQUEST
662
663static DECLARE_FAULT_ATTR(fail_make_request);
664
665static int __init setup_fail_make_request(char *str)
666{
667 return setup_fault_attr(&fail_make_request, str);
668}
669__setup("fail_make_request=", setup_fail_make_request);
670
8446fe92 671static bool should_fail_request(struct block_device *part, unsigned int bytes)
c17bb495 672{
8446fe92 673 return part->bd_make_it_fail && should_fail(&fail_make_request, bytes);
c17bb495
AM
674}
675
676static int __init fail_make_request_debugfs(void)
677{
dd48c085
AM
678 struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
679 NULL, &fail_make_request);
680
21f9fcd8 681 return PTR_ERR_OR_ZERO(dir);
c17bb495
AM
682}
683
684late_initcall(fail_make_request_debugfs);
685
686#else /* CONFIG_FAIL_MAKE_REQUEST */
687
8446fe92 688static inline bool should_fail_request(struct block_device *part,
b2c9cd37 689 unsigned int bytes)
c17bb495 690{
b2c9cd37 691 return false;
c17bb495
AM
692}
693
694#endif /* CONFIG_FAIL_MAKE_REQUEST */
695
2f9f6221 696static inline bool bio_check_ro(struct bio *bio)
721c7fc7 697{
2f9f6221 698 if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
721c7fc7
ID
699 char b[BDEVNAME_SIZE];
700
8b2ded1c
MP
701 if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
702 return false;
703
a32e236e 704 WARN_ONCE(1,
c8178674 705 "Trying to write to read-only block-device %s (partno %d)\n",
2f9f6221 706 bio_devname(bio, b), bio->bi_bdev->bd_partno);
a32e236e
LT
707 /* Older lvm-tools actually trigger this */
708 return false;
721c7fc7
ID
709 }
710
711 return false;
712}
713
30abb3a6
HM
714static noinline int should_fail_bio(struct bio *bio)
715{
309dca30 716 if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size))
30abb3a6
HM
717 return -EIO;
718 return 0;
719}
720ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
721
52c5e62d
CH
722/*
723 * Check whether this bio extends beyond the end of the device or partition.
724 * This may well happen - the kernel calls bread() without checking the size of
725 * the device, e.g., when mounting a file system.
726 */
2f9f6221 727static inline int bio_check_eod(struct bio *bio)
52c5e62d 728{
2f9f6221 729 sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
52c5e62d
CH
730 unsigned int nr_sectors = bio_sectors(bio);
731
732 if (nr_sectors && maxsector &&
733 (nr_sectors > maxsector ||
734 bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
735 handle_bad_sector(bio, maxsector);
736 return -EIO;
737 }
738 return 0;
739}
740
74d46992
CH
741/*
742 * Remap block n of partition p to block n+start(p) of the disk.
743 */
2f9f6221 744static int blk_partition_remap(struct bio *bio)
74d46992 745{
309dca30 746 struct block_device *p = bio->bi_bdev;
74d46992 747
52c5e62d 748 if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
2f9f6221 749 return -EIO;
5eac3eb3 750 if (bio_sectors(bio)) {
8446fe92 751 bio->bi_iter.bi_sector += p->bd_start_sect;
1c02fca6 752 trace_block_bio_remap(bio, p->bd_dev,
29ff57c6 753 bio->bi_iter.bi_sector -
8446fe92 754 p->bd_start_sect);
52c5e62d 755 }
30c5d345 756 bio_set_flag(bio, BIO_REMAPPED);
2f9f6221 757 return 0;
74d46992
CH
758}
759
0512a75b
KB
760/*
761 * Check write append to a zoned block device.
762 */
763static inline blk_status_t blk_check_zone_append(struct request_queue *q,
764 struct bio *bio)
765{
766 sector_t pos = bio->bi_iter.bi_sector;
767 int nr_sectors = bio_sectors(bio);
768
769 /* Only applicable to zoned block devices */
770 if (!blk_queue_is_zoned(q))
771 return BLK_STS_NOTSUPP;
772
773 /* The bio sector must point to the start of a sequential zone */
774 if (pos & (blk_queue_zone_sectors(q) - 1) ||
775 !blk_queue_zone_is_seq(q, pos))
776 return BLK_STS_IOERR;
777
778 /*
779 * Not allowed to cross zone boundaries. Otherwise, the BIO will be
780 * split and could result in non-contiguous sectors being written in
781 * different zones.
782 */
783 if (nr_sectors > q->limits.chunk_sectors)
784 return BLK_STS_IOERR;
785
786 /* Make sure the BIO is small enough and will not get split */
787 if (nr_sectors > q->limits.max_zone_append_sectors)
788 return BLK_STS_IOERR;
789
790 bio->bi_opf |= REQ_NOMERGE;
791
792 return BLK_STS_OK;
793}
794
ed00aabd 795static noinline_for_stack bool submit_bio_checks(struct bio *bio)
1da177e4 796{
309dca30
CH
797 struct block_device *bdev = bio->bi_bdev;
798 struct request_queue *q = bdev->bd_disk->queue;
4e4cbee9 799 blk_status_t status = BLK_STS_IOERR;
5a473e83 800 struct blk_plug *plug;
1da177e4
LT
801
802 might_sleep();
1da177e4 803
5a473e83
JA
804 plug = blk_mq_plug(q, bio);
805 if (plug && plug->nowait)
806 bio->bi_opf |= REQ_NOWAIT;
807
03a07c92 808 /*
b0beb280 809 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
021a2446 810 * if queue does not support NOWAIT.
03a07c92 811 */
021a2446 812 if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
b0beb280 813 goto not_supported;
03a07c92 814
30abb3a6 815 if (should_fail_bio(bio))
5a7bbad2 816 goto end_io;
2f9f6221
CH
817 if (unlikely(bio_check_ro(bio)))
818 goto end_io;
3a905c37
CH
819 if (!bio_flagged(bio, BIO_REMAPPED)) {
820 if (unlikely(bio_check_eod(bio)))
821 goto end_io;
822 if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
823 goto end_io;
824 }
2056a782 825
5a7bbad2 826 /*
ed00aabd
CH
 827 * Filter flush bios early so that bio-based drivers without flush
828 * support don't have to worry about them.
5a7bbad2 829 */
f3a8ab7d 830 if (op_is_flush(bio->bi_opf) &&
c888a8f9 831 !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
1eff9d32 832 bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
e439ab71 833 if (!bio_sectors(bio)) {
4e4cbee9 834 status = BLK_STS_OK;
51fd77bd
JA
835 goto end_io;
836 }
5a7bbad2 837 }
5ddfe969 838
d04c406f
CH
839 if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
840 bio->bi_opf &= ~REQ_HIPRI;
841
288dab8a
CH
842 switch (bio_op(bio)) {
843 case REQ_OP_DISCARD:
844 if (!blk_queue_discard(q))
845 goto not_supported;
846 break;
847 case REQ_OP_SECURE_ERASE:
848 if (!blk_queue_secure_erase(q))
849 goto not_supported;
850 break;
851 case REQ_OP_WRITE_SAME:
74d46992 852 if (!q->limits.max_write_same_sectors)
288dab8a 853 goto not_supported;
58886785 854 break;
0512a75b
KB
855 case REQ_OP_ZONE_APPEND:
856 status = blk_check_zone_append(q, bio);
857 if (status != BLK_STS_OK)
858 goto end_io;
859 break;
2d253440 860 case REQ_OP_ZONE_RESET:
6c1b1da5
AJ
861 case REQ_OP_ZONE_OPEN:
862 case REQ_OP_ZONE_CLOSE:
863 case REQ_OP_ZONE_FINISH:
74d46992 864 if (!blk_queue_is_zoned(q))
2d253440 865 goto not_supported;
288dab8a 866 break;
6e33dbf2
CK
867 case REQ_OP_ZONE_RESET_ALL:
868 if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
869 goto not_supported;
870 break;
a6f0788e 871 case REQ_OP_WRITE_ZEROES:
74d46992 872 if (!q->limits.max_write_zeroes_sectors)
a6f0788e
CK
873 goto not_supported;
874 break;
288dab8a
CH
875 default:
876 break;
5a7bbad2 877 }
01edede4 878
7f4b35d1 879 /*
3e82c348
CH
880 * Various block parts want %current->io_context, so allocate it up
881 * front rather than dealing with lots of pain to allocate it only
882 * where needed. This may fail and the block layer knows how to live
883 * with it.
7f4b35d1 884 */
3e82c348
CH
885 if (unlikely(!current->io_context))
886 create_task_io_context(current, GFP_ATOMIC, q->node);
7f4b35d1 887
db18a53e
CH
888 if (blk_throtl_bio(bio)) {
889 blkcg_bio_issue_init(bio);
ae118896 890 return false;
db18a53e
CH
891 }
892
893 blk_cgroup_bio_start(bio);
894 blkcg_bio_issue_init(bio);
27a84d54 895
fbbaf700 896 if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
e8a676d6 897 trace_block_bio_queue(bio);
fbbaf700
N
898 /* Now that enqueuing has been traced, we need to trace
899 * completion as well.
900 */
901 bio_set_flag(bio, BIO_TRACE_COMPLETION);
902 }
27a84d54 903 return true;
a7384677 904
288dab8a 905not_supported:
4e4cbee9 906 status = BLK_STS_NOTSUPP;
a7384677 907end_io:
4e4cbee9 908 bio->bi_status = status;
4246a0b6 909 bio_endio(bio);
27a84d54 910 return false;
1da177e4
LT
911}
912
ed00aabd 913static blk_qc_t __submit_bio(struct bio *bio)
ac7c5675 914{
309dca30 915 struct gendisk *disk = bio->bi_bdev->bd_disk;
ac7c5675
CH
916 blk_qc_t ret = BLK_QC_T_NONE;
917
918 if (blk_crypto_bio_prep(&bio)) {
c62b37d9
CH
919 if (!disk->fops->submit_bio)
920 return blk_mq_submit_bio(bio);
921 ret = disk->fops->submit_bio(bio);
ac7c5675 922 }
c62b37d9 923 blk_queue_exit(disk->queue);
ac7c5675
CH
924 return ret;
925}
926
566acf2d
CH
927/*
928 * The loop in this function may be a bit non-obvious, and so deserves some
929 * explanation:
930 *
931 * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
932 * that), so we have a list with a single bio.
933 * - We pretend that we have just taken it off a longer list, so we assign
934 * bio_list to a pointer to the bio_list_on_stack, thus initialising the
935 * bio_list of new bios to be added. ->submit_bio() may indeed add some more
936 * bios through a recursive call to submit_bio_noacct. If it did, we find a
937 * non-NULL value in bio_list and re-enter the loop from the top.
 938 * - In this case we really did just take the bio off the top of the list (no
939 * pretending) and so remove it from bio_list, and call into ->submit_bio()
940 * again.
941 *
942 * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
943 * bio_list_on_stack[1] contains bios that were submitted before the current
 944 * ->submit_bio, but that haven't been processed yet.
945 */
946static blk_qc_t __submit_bio_noacct(struct bio *bio)
947{
948 struct bio_list bio_list_on_stack[2];
949 blk_qc_t ret = BLK_QC_T_NONE;
950
951 BUG_ON(bio->bi_next);
952
953 bio_list_init(&bio_list_on_stack[0]);
954 current->bio_list = bio_list_on_stack;
955
956 do {
309dca30 957 struct request_queue *q = bio->bi_bdev->bd_disk->queue;
566acf2d
CH
958 struct bio_list lower, same;
959
960 if (unlikely(bio_queue_enter(bio) != 0))
961 continue;
962
963 /*
964 * Create a fresh bio_list for all subordinate requests.
965 */
966 bio_list_on_stack[1] = bio_list_on_stack[0];
967 bio_list_init(&bio_list_on_stack[0]);
968
969 ret = __submit_bio(bio);
970
971 /*
972 * Sort new bios into those for a lower level and those for the
973 * same level.
974 */
975 bio_list_init(&lower);
976 bio_list_init(&same);
977 while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
309dca30 978 if (q == bio->bi_bdev->bd_disk->queue)
566acf2d
CH
979 bio_list_add(&same, bio);
980 else
981 bio_list_add(&lower, bio);
982
983 /*
984 * Now assemble so we handle the lowest level first.
985 */
986 bio_list_merge(&bio_list_on_stack[0], &lower);
987 bio_list_merge(&bio_list_on_stack[0], &same);
988 bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
989 } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
990
991 current->bio_list = NULL;
992 return ret;
993}
994
ff93ea0c
CH
995static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
996{
7c792f33 997 struct bio_list bio_list[2] = { };
ff93ea0c
CH
998 blk_qc_t ret = BLK_QC_T_NONE;
999
7c792f33 1000 current->bio_list = bio_list;
ff93ea0c
CH
1001
1002 do {
309dca30 1003 struct gendisk *disk = bio->bi_bdev->bd_disk;
ff93ea0c
CH
1004
1005 if (unlikely(bio_queue_enter(bio) != 0))
1006 continue;
1007
1008 if (!blk_crypto_bio_prep(&bio)) {
1009 blk_queue_exit(disk->queue);
1010 ret = BLK_QC_T_NONE;
1011 continue;
1012 }
1013
1014 ret = blk_mq_submit_bio(bio);
7c792f33 1015 } while ((bio = bio_list_pop(&bio_list[0])));
ff93ea0c
CH
1016
1017 current->bio_list = NULL;
1018 return ret;
1019}
1020
27a84d54 1021/**
ed00aabd 1022 * submit_bio_noacct - re-submit a bio to the block device layer for I/O
27a84d54
CH
1023 * @bio: The bio describing the location in memory and on the device.
1024 *
3fdd4086
CH
1025 * This is a version of submit_bio() that shall only be used for I/O that is
1026 * resubmitted to lower level drivers by stacking block drivers. All file
1027 * systems and other upper level users of the block layer should use
1028 * submit_bio() instead.
d89d8796 1029 */
ed00aabd 1030blk_qc_t submit_bio_noacct(struct bio *bio)
d89d8796 1031{
ed00aabd 1032 if (!submit_bio_checks(bio))
566acf2d 1033 return BLK_QC_T_NONE;
27a84d54
CH
1034
1035 /*
566acf2d
CH
1036 * We only want one ->submit_bio to be active at a time, else stack
1037 * usage with stacked devices could be a problem. Use current->bio_list
 1038 * to collect a list of requests submitted by a ->submit_bio method while
 1039 * it is active, and then process them after it has returned.
27a84d54 1040 */
bddd87c7 1041 if (current->bio_list) {
f5fe1b51 1042 bio_list_add(&current->bio_list[0], bio);
566acf2d 1043 return BLK_QC_T_NONE;
d89d8796 1044 }
27a84d54 1045
309dca30 1046 if (!bio->bi_bdev->bd_disk->fops->submit_bio)
ff93ea0c 1047 return __submit_bio_noacct_mq(bio);
566acf2d 1048 return __submit_bio_noacct(bio);
d89d8796 1049}
ed00aabd 1050EXPORT_SYMBOL(submit_bio_noacct);
1da177e4
LT
1051
1052/**
710027a4 1053 * submit_bio - submit a bio to the block device layer for I/O
1da177e4
LT
1054 * @bio: The &struct bio which describes the I/O
1055 *
3fdd4086
CH
1056 * submit_bio() is used to submit I/O requests to block devices. It is passed a
1057 * fully set up &struct bio that describes the I/O that needs to be done. The
309dca30 1058 * bio will be sent to the device described by the bi_bdev field.
1da177e4 1059 *
3fdd4086
CH
1060 * The success/failure status of the request, along with notification of
1061 * completion, is delivered asynchronously through the ->bi_end_io() callback
 1062 * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
1063 * been called.
1da177e4 1064 */
4e49ea4a 1065blk_qc_t submit_bio(struct bio *bio)
1da177e4 1066{
d3f77dfd
TH
1067 if (blkcg_punt_bio_submit(bio))
1068 return BLK_QC_T_NONE;
1069
bf2de6f5
JA
1070 /*
1071 * If it's a regular read/write or a barrier with data attached,
1072 * go through the normal accounting stuff before submission.
1073 */
e2a60da7 1074 if (bio_has_data(bio)) {
4363ac7c
MP
1075 unsigned int count;
1076
95fe6c1a 1077 if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
309dca30
CH
1078 count = queue_logical_block_size(
1079 bio->bi_bdev->bd_disk->queue) >> 9;
4363ac7c
MP
1080 else
1081 count = bio_sectors(bio);
1082
a8ebb056 1083 if (op_is_write(bio_op(bio))) {
bf2de6f5
JA
1084 count_vm_events(PGPGOUT, count);
1085 } else {
4f024f37 1086 task_io_account_read(bio->bi_iter.bi_size);
bf2de6f5
JA
1087 count_vm_events(PGPGIN, count);
1088 }
1089
1090 if (unlikely(block_dump)) {
1091 char b[BDEVNAME_SIZE];
8dcbdc74 1092 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
ba25f9dc 1093 current->comm, task_pid_nr(current),
a8ebb056 1094 op_is_write(bio_op(bio)) ? "WRITE" : "READ",
4f024f37 1095 (unsigned long long)bio->bi_iter.bi_sector,
74d46992 1096 bio_devname(bio, b), count);
bf2de6f5 1097 }
1da177e4
LT
1098 }
1099
b8e24a93 1100 /*
760f83ea
CH
1101 * If we're reading data that is part of the userspace workingset, count
1102 * submission time as memory stall. When the device is congested, or
 1103 * the submitting cgroup is IO-throttled, submission can be a significant
1104 * part of overall IO time.
b8e24a93 1105 */
760f83ea
CH
1106 if (unlikely(bio_op(bio) == REQ_OP_READ &&
1107 bio_flagged(bio, BIO_WORKINGSET))) {
1108 unsigned long pflags;
1109 blk_qc_t ret;
b8e24a93 1110
760f83ea 1111 psi_memstall_enter(&pflags);
ed00aabd 1112 ret = submit_bio_noacct(bio);
b8e24a93
JW
1113 psi_memstall_leave(&pflags);
1114
760f83ea
CH
1115 return ret;
1116 }
1117
ed00aabd 1118 return submit_bio_noacct(bio);
1da177e4 1119}
1da177e4
LT
1120EXPORT_SYMBOL(submit_bio);
1121
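/*
 * Illustrative sketch (not part of this file): a typical caller builds a
 * bio against a block device and submits it; completion is reported via
 * ->bi_end_io() (my_end_io is hypothetical):
 *
 *	struct bio *bio = bio_alloc(GFP_NOIO, 1);
 *
 *	bio_set_dev(bio, bdev);
 *	bio->bi_iter.bi_sector = sector;
 *	bio->bi_opf = REQ_OP_READ;
 *	bio_add_page(bio, page, PAGE_SIZE, 0);
 *	bio->bi_end_io = my_end_io;
 *	submit_bio(bio);
 */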
82124d60 1122/**
bf4e6b4e 1123 * blk_cloned_rq_check_limits - Helper function to check a cloned request
0d720318 1124 * for the new queue limits
82124d60
KU
1125 * @q: the queue
1126 * @rq: the request being checked
1127 *
1128 * Description:
1129 * @rq may have been made based on weaker limitations of upper-level queues
1130 * in request stacking drivers, and it may violate the limitation of @q.
1131 * Since the block layer and the underlying device driver trust @rq
1132 * after it is inserted to @q, it should be checked against @q before
1133 * the insertion using this generic function.
1134 *
82124d60 1135 * Request stacking drivers like request-based dm may change the queue
bf4e6b4e
HR
1136 * limits when retrying requests on other queues. Those requests need
1137 * to be checked against the new queue limits again during dispatch.
82124d60 1138 */
143d2600 1139static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
bf4e6b4e 1140 struct request *rq)
82124d60 1141{
8327cce5
RS
1142 unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
1143
1144 if (blk_rq_sectors(rq) > max_sectors) {
1145 /*
1146 * SCSI device does not have a good way to return if
1147 * Write Same/Zero is actually supported. If a device rejects
 1148 * a non-read/write command (discard, write same, etc.) the
1149 * low-level device driver will set the relevant queue limit to
1150 * 0 to prevent blk-lib from issuing more of the offending
1151 * operations. Commands queued prior to the queue limit being
1152 * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
1153 * errors being propagated to upper layers.
1154 */
1155 if (max_sectors == 0)
1156 return BLK_STS_NOTSUPP;
1157
61939b12 1158 printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
8327cce5 1159 __func__, blk_rq_sectors(rq), max_sectors);
143d2600 1160 return BLK_STS_IOERR;
82124d60
KU
1161 }
1162
1163 /*
1164 * queue's settings related to segment counting like q->bounce_pfn
1165 * may differ from that of other stacking queues.
1166 * Recalculate it to check the request correctly on this queue's
1167 * limitation.
1168 */
e9cd19c0 1169 rq->nr_phys_segments = blk_recalc_rq_segments(rq);
8a78362c 1170 if (rq->nr_phys_segments > queue_max_segments(q)) {
61939b12
JP
1171 printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
1172 __func__, rq->nr_phys_segments, queue_max_segments(q));
143d2600 1173 return BLK_STS_IOERR;
82124d60
KU
1174 }
1175
143d2600 1176 return BLK_STS_OK;
82124d60 1177}
82124d60
KU
1178
1179/**
1180 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
1181 * @q: the queue to submit the request
1182 * @rq: the request being queued
1183 */
2a842aca 1184blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
82124d60 1185{
8327cce5
RS
1186 blk_status_t ret;
1187
1188 ret = blk_cloned_rq_check_limits(q, rq);
1189 if (ret != BLK_STS_OK)
1190 return ret;
82124d60 1191
b2c9cd37 1192 if (rq->rq_disk &&
8446fe92 1193 should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq)))
2a842aca 1194 return BLK_STS_IOERR;
82124d60 1195
a892c8d5
ST
1196 if (blk_crypto_insert_cloned_request(rq))
1197 return BLK_STS_IOERR;
1198
a1ce35fa 1199 if (blk_queue_io_stat(q))
b5af37ab 1200 blk_account_io_start(rq);
82124d60
KU
1201
1202 /*
a1ce35fa
JA
1203 * Since we have a scheduler attached on the top device,
1204 * bypass a potential scheduler on the bottom device for
1205 * insert.
82124d60 1206 */
fd9c40f6 1207 return blk_mq_request_issue_directly(rq, true);
82124d60
KU
1208}
1209EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
1210
80a761fd
TH
1211/**
1212 * blk_rq_err_bytes - determine number of bytes till the next failure boundary
1213 * @rq: request to examine
1214 *
1215 * Description:
 1216 * A request could be a merge of IOs which require different failure
 1217 * handling. This function determines the number of bytes which
 1218 * can be failed from the beginning of the request without
 1219 * crossing into an area which needs to be retried further.
1220 *
1221 * Return:
1222 * The number of bytes to fail.
80a761fd
TH
1223 */
1224unsigned int blk_rq_err_bytes(const struct request *rq)
1225{
1226 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
1227 unsigned int bytes = 0;
1228 struct bio *bio;
1229
e8064021 1230 if (!(rq->rq_flags & RQF_MIXED_MERGE))
80a761fd
TH
1231 return blk_rq_bytes(rq);
1232
1233 /*
1234 * Currently the only 'mixing' which can happen is between
1235 * different fastfail types. We can safely fail portions
1236 * which have all the failfast bits that the first one has -
1237 * the ones which are at least as eager to fail as the first
1238 * one.
1239 */
1240 for (bio = rq->bio; bio; bio = bio->bi_next) {
1eff9d32 1241 if ((bio->bi_opf & ff) != ff)
80a761fd 1242 break;
4f024f37 1243 bytes += bio->bi_iter.bi_size;
80a761fd
TH
1244 }
1245
1246 /* this could lead to infinite loop */
1247 BUG_ON(blk_rq_bytes(rq) && !bytes);
1248 return bytes;
1249}
1250EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
1251
8446fe92
CH
1252static void update_io_ticks(struct block_device *part, unsigned long now,
1253 bool end)
9123bf6f
CH
1254{
1255 unsigned long stamp;
1256again:
8446fe92 1257 stamp = READ_ONCE(part->bd_stamp);
9123bf6f 1258 if (unlikely(stamp != now)) {
8446fe92 1259 if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp))
9123bf6f
CH
1260 __part_stat_add(part, io_ticks, end ? now - stamp : 1);
1261 }
8446fe92
CH
1262 if (part->bd_partno) {
1263 part = bdev_whole(part);
9123bf6f
CH
1264 goto again;
1265 }
1266}
1267
f1394b79 1268static void blk_account_io_completion(struct request *req, unsigned int bytes)
bc58ba94 1269{
ecb6186c 1270 if (req->part && blk_do_io_stat(req)) {
ddcf35d3 1271 const int sgrp = op_stat_group(req_op(req));
bc58ba94 1272
112f158f 1273 part_stat_lock();
8446fe92 1274 part_stat_add(req->part, sectors[sgrp], bytes >> 9);
bc58ba94
JA
1275 part_stat_unlock();
1276 }
1277}
1278
522a7775 1279void blk_account_io_done(struct request *req, u64 now)
bc58ba94 1280{
bc58ba94 1281 /*
dd4c133f
TH
1282 * Account IO completion. flush_rq isn't accounted as a
1283 * normal IO on queueing nor completion. Accounting the
1284 * containing request is enough.
bc58ba94 1285 */
ecb6186c
LG
1286 if (req->part && blk_do_io_stat(req) &&
1287 !(req->rq_flags & RQF_FLUSH_SEQ)) {
ddcf35d3 1288 const int sgrp = op_stat_group(req_op(req));
bc58ba94 1289
112f158f 1290 part_stat_lock();
8446fe92
CH
1291 update_io_ticks(req->part, jiffies, true);
1292 part_stat_inc(req->part, ios[sgrp]);
1293 part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
524f9ffd 1294 part_stat_unlock();
bc58ba94
JA
1295 }
1296}
1297
b5af37ab 1298void blk_account_io_start(struct request *rq)
320ae51f 1299{
320ae51f
JA
1300 if (!blk_do_io_stat(rq))
1301 return;
1302
0b6e522c
CH
1303 /* passthrough requests can hold bios that do not have ->bi_bdev set */
1304 if (rq->bio && rq->bio->bi_bdev)
1305 rq->part = rq->bio->bi_bdev;
1306 else
1307 rq->part = rq->rq_disk->part0;
524f9ffd 1308
112f158f 1309 part_stat_lock();
76268f3a 1310 update_io_ticks(rq->part, jiffies, false);
320ae51f
JA
1311 part_stat_unlock();
1312}
320ae51f 1313
8446fe92 1314static unsigned long __part_start_io_acct(struct block_device *part,
7b26410b 1315 unsigned int sectors, unsigned int op)
956d510e 1316{
956d510e
CH
1317 const int sgrp = op_stat_group(op);
1318 unsigned long now = READ_ONCE(jiffies);
1319
1320 part_stat_lock();
1321 update_io_ticks(part, now, false);
1322 part_stat_inc(part, ios[sgrp]);
1323 part_stat_add(part, sectors[sgrp], sectors);
1324 part_stat_local_inc(part, in_flight[op_is_write(op)]);
1325 part_stat_unlock();
320ae51f 1326
956d510e
CH
1327 return now;
1328}
7b26410b 1329
99dfc43e
CH
1330/**
1331 * bio_start_io_acct - start I/O accounting for bio based drivers
1332 * @bio: bio to start account for
1333 *
1334 * Returns the start time that should be passed back to bio_end_io_acct().
1335 */
1336unsigned long bio_start_io_acct(struct bio *bio)
7b26410b 1337{
99dfc43e 1338 return __part_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio));
7b26410b 1339}
99dfc43e 1340EXPORT_SYMBOL_GPL(bio_start_io_acct);
7b26410b
SL
1341
1342unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
1343 unsigned int op)
1344{
8446fe92 1345 return __part_start_io_acct(disk->part0, sectors, op);
7b26410b 1346}
956d510e
CH
1347EXPORT_SYMBOL(disk_start_io_acct);
1348
8446fe92 1349static void __part_end_io_acct(struct block_device *part, unsigned int op,
7b26410b 1350 unsigned long start_time)
956d510e 1351{
956d510e
CH
1352 const int sgrp = op_stat_group(op);
1353 unsigned long now = READ_ONCE(jiffies);
1354 unsigned long duration = now - start_time;
5b18b5a7 1355
956d510e
CH
1356 part_stat_lock();
1357 update_io_ticks(part, now, true);
1358 part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
1359 part_stat_local_dec(part, in_flight[op_is_write(op)]);
320ae51f
JA
1360 part_stat_unlock();
1361}
7b26410b 1362
99dfc43e
CH
1363void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
1364 struct block_device *orig_bdev)
7b26410b 1365{
99dfc43e 1366 __part_end_io_acct(orig_bdev, bio_op(bio), start_time);
7b26410b 1367}
99dfc43e 1368EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
7b26410b
SL
1369
1370void disk_end_io_acct(struct gendisk *disk, unsigned int op,
1371 unsigned long start_time)
1372{
8446fe92 1373 __part_end_io_acct(disk->part0, op, start_time);
7b26410b 1374}
956d510e 1375EXPORT_SYMBOL(disk_end_io_acct);
320ae51f 1376
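/*
 * Illustrative sketch (not part of this file): a bio-based driver
 * typically accounts each bio around its processing; bio_end_io_acct()
 * is the non-remapped wrapper around bio_end_io_acct_remapped():
 *
 *	unsigned long start = bio_start_io_acct(bio);
 *	// ... process the bio ...
 *	bio_end_io_acct(bio, start);
 */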
ef71de8b
CH
1377/*
1378 * Steal bios from a request and add them to a bio list.
1379 * The request must not have been partially completed before.
1380 */
1381void blk_steal_bios(struct bio_list *list, struct request *rq)
1382{
1383 if (rq->bio) {
1384 if (list->tail)
1385 list->tail->bi_next = rq->bio;
1386 else
1387 list->head = rq->bio;
1388 list->tail = rq->biotail;
1389
1390 rq->bio = NULL;
1391 rq->biotail = NULL;
1392 }
1393
1394 rq->__data_len = 0;
1395}
1396EXPORT_SYMBOL_GPL(blk_steal_bios);
1397
3bcddeac 1398/**
2e60e022 1399 * blk_update_request - Special helper function for request stacking drivers
8ebf9756 1400 * @req: the request being processed
2a842aca 1401 * @error: block status code
8ebf9756 1402 * @nr_bytes: number of bytes to complete @req
3bcddeac
KU
1403 *
1404 * Description:
8ebf9756
RD
1405 * Ends I/O on a number of bytes attached to @req, but doesn't complete
1406 * the request structure even if @req doesn't have leftover.
1407 * If @req has leftover, sets it up for the next range of segments.
2e60e022
TH
1408 *
1409 * This special helper function is only for request stacking drivers
1410 * (e.g. request-based dm) so that they can handle partial completion.
3a211b71 1411 * Actual device drivers should use blk_mq_end_request instead.
2e60e022
TH
1412 *
1413 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
1414 * %false return from this function.
3bcddeac 1415 *
1954e9a9
BVA
1416 * Note:
1417 * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
1418 * blk_rq_bytes() and in blk_update_request().
1419 *
3bcddeac 1420 * Return:
2e60e022
TH
1421 * %false - this request doesn't have any more data
1422 * %true - this request has more data
3bcddeac 1423 **/
2a842aca
CH
1424bool blk_update_request(struct request *req, blk_status_t error,
1425 unsigned int nr_bytes)
1da177e4 1426{
f79ea416 1427 int total_bytes;
1da177e4 1428
2a842aca 1429 trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);
4a0efdc9 1430
2e60e022
TH
1431 if (!req->bio)
1432 return false;
1433
54d4e6ab
MG
1434#ifdef CONFIG_BLK_DEV_INTEGRITY
1435 if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
1436 error == BLK_STS_OK)
1437 req->q->integrity.profile->complete_fn(req, nr_bytes);
1438#endif
1439
2a842aca
CH
1440 if (unlikely(error && !blk_rq_is_passthrough(req) &&
1441 !(req->rq_flags & RQF_QUIET)))
178cc590 1442 print_req_error(req, error, __func__);
1da177e4 1443
bc58ba94 1444 blk_account_io_completion(req, nr_bytes);
d72d904a 1445
f79ea416
KO
1446 total_bytes = 0;
1447 while (req->bio) {
1448 struct bio *bio = req->bio;
4f024f37 1449 unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
1da177e4 1450
9c24c10a 1451 if (bio_bytes == bio->bi_iter.bi_size)
1da177e4 1452 req->bio = bio->bi_next;
1da177e4 1453
fbbaf700
N
1454 /* Completion has already been traced */
1455 bio_clear_flag(bio, BIO_TRACE_COMPLETION);
f79ea416 1456 req_bio_endio(req, bio, bio_bytes, error);
1da177e4 1457
f79ea416
KO
1458 total_bytes += bio_bytes;
1459 nr_bytes -= bio_bytes;
1da177e4 1460
f79ea416
KO
1461 if (!nr_bytes)
1462 break;
1da177e4
LT
1463 }
1464
1465 /*
1466 * completely done
1467 */
2e60e022
TH
1468 if (!req->bio) {
1469 /*
1470 * Reset counters so that the request stacking driver
1471 * can find how many bytes remain in the request
1472 * later.
1473 */
a2dec7b3 1474 req->__data_len = 0;
2e60e022
TH
1475 return false;
1476 }
1da177e4 1477
a2dec7b3 1478 req->__data_len -= total_bytes;
2e46e8b2
TH
1479
1480 /* update sector only for requests with clear definition of sector */
57292b58 1481 if (!blk_rq_is_passthrough(req))
a2dec7b3 1482 req->__sector += total_bytes >> 9;
2e46e8b2 1483
80a761fd 1484 /* mixed attributes always follow the first bio */
e8064021 1485 if (req->rq_flags & RQF_MIXED_MERGE) {
80a761fd 1486 req->cmd_flags &= ~REQ_FAILFAST_MASK;
1eff9d32 1487 req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
80a761fd
TH
1488 }
1489
ed6565e7
CH
1490 if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
1491 /*
1492 * If total number of sectors is less than the first segment
1493 * size, something has gone terribly wrong.
1494 */
1495 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
1496 blk_dump_rq_flags(req, "request botched");
1497 req->__data_len = blk_rq_cur_bytes(req);
1498 }
2e46e8b2 1499
ed6565e7 1500 /* recalculate the number of segments */
e9cd19c0 1501 req->nr_phys_segments = blk_recalc_rq_segments(req);
ed6565e7 1502 }
2e46e8b2 1503
2e60e022 1504 return true;
1da177e4 1505}
2e60e022 1506EXPORT_SYMBOL_GPL(blk_update_request);
1da177e4 1507
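/*
 * Illustrative sketch (not part of this file): a request stacking driver
 * that completes a request piecewise checks the return value to know
 * whether any bios remain (done_bytes is hypothetical):
 *
 *	if (!blk_update_request(rq, BLK_STS_OK, done_bytes))
 *		__blk_mq_end_request(rq, BLK_STS_OK);	// nothing left
 */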
2d4dc890
IL
1508#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
1509/**
1510 * rq_flush_dcache_pages - Helper function to flush all pages in a request
1511 * @rq: the request to be flushed
1512 *
1513 * Description:
1514 * Flush all pages in @rq.
1515 */
1516void rq_flush_dcache_pages(struct request *rq)
1517{
1518 struct req_iterator iter;
7988613b 1519 struct bio_vec bvec;
2d4dc890
IL
1520
1521 rq_for_each_segment(bvec, rq, iter)
7988613b 1522 flush_dcache_page(bvec.bv_page);
2d4dc890
IL
1523}
1524EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
1525#endif
1526
ef9e3fac
KU
1527/**
1528 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
1529 * @q : the queue of the device being checked
1530 *
1531 * Description:
1532 * Check if underlying low-level drivers of a device are busy.
 1533 * If the drivers want to export their busy state, they must set their own
1534 * exporting function using blk_queue_lld_busy() first.
1535 *
1536 * Basically, this function is used only by request stacking drivers
1537 * to stop dispatching requests to underlying devices when underlying
1538 * devices are busy. This behavior helps more I/O merging on the queue
1539 * of the request stacking driver and prevents I/O throughput regression
1540 * on burst I/O load.
1541 *
1542 * Return:
1543 * 0 - Not busy (The request stacking driver should dispatch request)
1544 * 1 - Busy (The request stacking driver should stop dispatching request)
1545 */
1546int blk_lld_busy(struct request_queue *q)
1547{
344e9ffc 1548 if (queue_is_mq(q) && q->mq_ops->busy)
9ba20527 1549 return q->mq_ops->busy(q);
ef9e3fac
KU
1550
1551 return 0;
1552}
1553EXPORT_SYMBOL_GPL(blk_lld_busy);
1554
78d8e58a
MS
1555/**
1556 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
1557 * @rq: the clone request to be cleaned up
1558 *
1559 * Description:
1560 * Free all bios in @rq for a cloned request.
1561 */
1562void blk_rq_unprep_clone(struct request *rq)
1563{
1564 struct bio *bio;
1565
1566 while ((bio = rq->bio) != NULL) {
1567 rq->bio = bio->bi_next;
1568
1569 bio_put(bio);
1570 }
1571}
1572EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
1573
78d8e58a
MS
1574/**
1575 * blk_rq_prep_clone - Helper function to setup clone request
1576 * @rq: the request to be setup
1577 * @rq_src: original request to be cloned
1578 * @bs: bio_set that bios for clone are allocated from
1579 * @gfp_mask: memory allocation mask for bio
1580 * @bio_ctr: setup function to be called for each clone bio.
1581 * Returns %0 for success, non %0 for failure.
1582 * @data: private data to be passed to @bio_ctr
1583 *
1584 * Description:
1585 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
78d8e58a
MS
1586 * Also, pages which the original bios are pointing to are not copied
 1587 * and the cloned bios just point to the same pages.
1588 * So cloned bios must be completed before original bios, which means
1589 * the caller must complete @rq before @rq_src.
1590 */
1591int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
1592 struct bio_set *bs, gfp_t gfp_mask,
1593 int (*bio_ctr)(struct bio *, struct bio *, void *),
1594 void *data)
1595{
1596 struct bio *bio, *bio_src;
1597
1598 if (!bs)
f4f8154a 1599 bs = &fs_bio_set;
78d8e58a
MS
1600
1601 __rq_for_each_bio(bio_src, rq_src) {
1602 bio = bio_clone_fast(bio_src, gfp_mask, bs);
1603 if (!bio)
1604 goto free_and_out;
1605
1606 if (bio_ctr && bio_ctr(bio, bio_src, data))
1607 goto free_and_out;
1608
1609 if (rq->bio) {
1610 rq->biotail->bi_next = bio;
1611 rq->biotail = bio;
93f221ae 1612 } else {
78d8e58a 1613 rq->bio = rq->biotail = bio;
93f221ae
EB
1614 }
1615 bio = NULL;
78d8e58a
MS
1616 }
1617
361301a2
GJ
1618 /* Copy attributes of the original request to the clone request. */
1619 rq->__sector = blk_rq_pos(rq_src);
1620 rq->__data_len = blk_rq_bytes(rq_src);
1621 if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
1622 rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
1623 rq->special_vec = rq_src->special_vec;
1624 }
1625 rq->nr_phys_segments = rq_src->nr_phys_segments;
1626 rq->ioprio = rq_src->ioprio;
78d8e58a 1627
93f221ae
EB
1628 if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
1629 goto free_and_out;
78d8e58a
MS
1630
1631 return 0;
1632
1633free_and_out:
1634 if (bio)
1635 bio_put(bio);
1636 blk_rq_unprep_clone(rq);
1637
1638 return -ENOMEM;
b0fd271d
KU
1639}
1640EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
1641
59c3d45e 1642int kblockd_schedule_work(struct work_struct *work)
1da177e4
LT
1643{
1644 return queue_work(kblockd_workqueue, work);
1645}
1da177e4
LT
1646EXPORT_SYMBOL(kblockd_schedule_work);
1647
818cd1cb
JA
1648int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
1649 unsigned long delay)
1650{
1651 return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
1652}
1653EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
1654
75df7136
SJ
1655/**
1656 * blk_start_plug - initialize blk_plug and track it inside the task_struct
1657 * @plug: The &struct blk_plug that needs to be initialized
1658 *
1659 * Description:
40405851
JM
1660 * blk_start_plug() indicates to the block layer an intent by the caller
1661 * to submit multiple I/O requests in a batch. The block layer may use
1662 * this hint to defer submitting I/Os from the caller until blk_finish_plug()
1663 * is called. However, the block layer may choose to submit requests
1664 * before a call to blk_finish_plug() if the number of queued I/Os
1665 * exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
1666 * %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if
1667 * the task schedules (see below).
1668 *
75df7136
SJ
1669 * Tracking blk_plug inside the task_struct will help with auto-flushing the
1670 * pending I/O should the task end up blocking between blk_start_plug() and
1671 * blk_finish_plug(). This is important from a performance perspective, but
1672 * also ensures that we don't deadlock. For instance, if the task is blocking
1673 * for a memory allocation, memory reclaim could end up wanting to free a
1674 * page belonging to that request that is currently residing in our private
1675 * plug. By flushing the pending I/O when the process goes to sleep, we avoid
1676 * this kind of deadlock.
1677 */
73c10101
JA
1678void blk_start_plug(struct blk_plug *plug)
1679{
1680 struct task_struct *tsk = current;
1681
dd6cf3e1
SL
1682 /*
1683 * If this is a nested plug, don't actually assign it.
1684 */
1685 if (tsk->plug)
1686 return;
1687
320ae51f 1688 INIT_LIST_HEAD(&plug->mq_list);
048c9374 1689 INIT_LIST_HEAD(&plug->cb_list);
5f0ed774 1690 plug->rq_count = 0;
ce5b009c 1691 plug->multiple_queues = false;
5a473e83 1692 plug->nowait = false;
5f0ed774 1693
73c10101 1694 /*
dd6cf3e1
SL
1695 * Store ordering should not be needed here, since a potential
1696 * preempt will imply a full memory barrier
73c10101 1697 */
dd6cf3e1 1698 tsk->plug = plug;
73c10101
JA
1699}
1700EXPORT_SYMBOL(blk_start_plug);
1701
74018dc3 1702static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
048c9374
N
1703{
1704 LIST_HEAD(callbacks);
1705
2a7d5559
SL
1706 while (!list_empty(&plug->cb_list)) {
1707 list_splice_init(&plug->cb_list, &callbacks);
048c9374 1708
2a7d5559
SL
1709 while (!list_empty(&callbacks)) {
1710 struct blk_plug_cb *cb = list_first_entry(&callbacks,
048c9374
N
1711 struct blk_plug_cb,
1712 list);
2a7d5559 1713 list_del(&cb->list);
74018dc3 1714 cb->callback(cb, from_schedule);
2a7d5559 1715 }
048c9374
N
1716 }
1717}
1718
9cbb1750
N
1719struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
1720 int size)
1721{
1722 struct blk_plug *plug = current->plug;
1723 struct blk_plug_cb *cb;
1724
1725 if (!plug)
1726 return NULL;
1727
1728 list_for_each_entry(cb, &plug->cb_list, list)
1729 if (cb->callback == unplug && cb->data == data)
1730 return cb;
1731
1732 /* Not currently on the callback list */
1733 BUG_ON(size < sizeof(*cb));
1734 cb = kzalloc(size, GFP_ATOMIC);
1735 if (cb) {
1736 cb->data = data;
1737 cb->callback = unplug;
1738 list_add(&cb->list, &plug->cb_list);
1739 }
1740 return cb;
1741}
1742EXPORT_SYMBOL(blk_check_plugged);
1743
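/*
 * Illustrative sketch (not part of this file): stacking drivers use
 * blk_check_plugged() to hook their own unplug callback into the current
 * plug (my_unplug/my_data/my_unplug_now are hypothetical); a NULL return
 * means no plug is active and the work should be done immediately:
 *
 *	struct blk_plug_cb *cb;
 *
 *	cb = blk_check_plugged(my_unplug, my_data, sizeof(*cb));
 *	if (!cb)
 *		my_unplug_now(my_data);
 */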
49cac01e 1744void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
73c10101 1745{
74018dc3 1746 flush_plug_callbacks(plug, from_schedule);
320ae51f
JA
1747
1748 if (!list_empty(&plug->mq_list))
1749 blk_mq_flush_plug_list(plug, from_schedule);
73c10101 1750}
73c10101 1751
40405851
JM
1752/**
1753 * blk_finish_plug - mark the end of a batch of submitted I/O
1754 * @plug: The &struct blk_plug passed to blk_start_plug()
1755 *
1756 * Description:
1757 * Indicate that a batch of I/O submissions is complete. This function
1758 * must be paired with an initial call to blk_start_plug(). The intent
1759 * is to allow the block layer to optimize I/O submission. See the
1760 * documentation for blk_start_plug() for more information.
1761 */
73c10101
JA
1762void blk_finish_plug(struct blk_plug *plug)
1763{
dd6cf3e1
SL
1764 if (plug != current->plug)
1765 return;
f6603783 1766 blk_flush_plug_list(plug, false);
73c10101 1767
dd6cf3e1 1768 current->plug = NULL;
73c10101 1769}
88b996cd 1770EXPORT_SYMBOL(blk_finish_plug);
73c10101 1771
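/*
 * Illustrative sketch (not part of this file): submitters batch a run of
 * bios under one plug so the block layer can merge and dispatch them
 * together (bios[]/nr are hypothetical):
 *
 *	struct blk_plug plug;
 *
 *	blk_start_plug(&plug);
 *	for (i = 0; i < nr; i++)
 *		submit_bio(bios[i]);
 *	blk_finish_plug(&plug);
 */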
71ac860a
ML
1772void blk_io_schedule(void)
1773{
1774 /* Prevent hang_check timer from firing at us during very long I/O */
1775 unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
1776
1777 if (timeout)
1778 io_schedule_timeout(timeout);
1779 else
1780 io_schedule();
1781}
1782EXPORT_SYMBOL_GPL(blk_io_schedule);
1783
1da177e4
LT
1784int __init blk_dev_init(void)
1785{
ef295ecf
CH
1786 BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
1787 BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
c593642c 1788 sizeof_field(struct request, cmd_flags));
ef295ecf 1789 BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
c593642c 1790 sizeof_field(struct bio, bi_opf));
9eb55b03 1791
89b90be2
TH
1792 /* used for unplugging and affects IO latency/throughput - HIGHPRI */
1793 kblockd_workqueue = alloc_workqueue("kblockd",
28747fcd 1794 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
1da177e4
LT
1795 if (!kblockd_workqueue)
1796 panic("Failed to create kblockd\n");
1797
c2789bd4 1798 blk_requestq_cachep = kmem_cache_create("request_queue",
165125e1 1799 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
1da177e4 1800
18fbda91 1801 blk_debugfs_root = debugfs_create_dir("block", NULL);
18fbda91 1802
d38ecf93 1803 return 0;
1da177e4 1804}