/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/srcu.h>
#include <linux/lockdep.h>
struct blk_flush_queue;
/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
	struct {
		/** @lock: Protects the dispatch list. */
		spinlock_t		lock;
		/**
		 * @dispatch: Used for requests that are ready to be
		 * dispatched to the hardware but for some reason (e.g. lack of
		 * resources) could not be sent to the hardware. As soon as the
		 * driver can send new requests, requests on this list will
		 * be sent first for a fairer dispatch.
		 */
		struct list_head	dispatch;
		/**
		 * @state: BLK_MQ_S_* flags. Defines the state of the hw
		 * queue (active, scheduled to restart, stopped).
		 */
		unsigned long		state;
	} ____cacheline_aligned_in_smp;
	/**
	 * @run_work: Used for scheduling a hardware queue run at a later time.
	 */
	struct delayed_work	run_work;
	/** @cpumask: Map of available CPUs where this hctx can run. */
	cpumask_var_t		cpumask;

	/**
	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
	 * selection from @cpumask.
	 */
	int			next_cpu;
	/**
	 * @next_cpu_batch: Counter of how many works are left in the batch
	 * before changing to the next CPU.
	 */
	int			next_cpu_batch;

	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
	unsigned long		flags;
	/**
	 * @sched_data: Pointer owned by the IO scheduler attached to a request
	 * queue. It's up to the IO scheduler how to use this pointer.
	 */
	void			*sched_data;
	/**
	 * @queue: Pointer to the request queue that owns this hardware context.
	 */
	struct request_queue	*queue;
	/** @fq: Queue of requests that need to perform a flush operation. */
	struct blk_flush_queue	*fq;

	/**
	 * @driver_data: Pointer to data owned by the block driver that created
	 * this hctx.
	 */
	void			*driver_data;

	/**
	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
	 * pending request in that software queue.
	 */
	struct sbitmap		ctx_map;
	/**
	 * @dispatch_from: Software queue to be used when no scheduler was
	 * selected.
	 */
	struct blk_mq_ctx	*dispatch_from;
	/**
	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
	 * decide if the hw_queue is busy using Exponential Weighted Moving
	 * Average algorithm.
	 */
	unsigned int		dispatch_busy;

	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
	unsigned short		type;
	/** @nr_ctx: Number of software queues. */
	unsigned short		nr_ctx;
	/** @ctxs: Array of software queues. */
	struct blk_mq_ctx	**ctxs;
	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
	spinlock_t		dispatch_wait_lock;
	/**
	 * @dispatch_wait: Waitqueue to put requests when there is no tag
	 * available at the moment, to wait for another try in the future.
	 */
	wait_queue_entry_t	dispatch_wait;

	/**
	 * @wait_index: Index of next available dispatch_wait queue to insert
	 * requests.
	 */
	atomic_t		wait_index;

	/**
	 * @tags: Tags owned by the block driver. A tag in this set is only
	 * assigned when a request is dispatched from a hardware queue.
	 */
	struct blk_mq_tags	*tags;
	/**
	 * @sched_tags: Tags owned by the I/O scheduler. If there is an I/O
	 * scheduler associated with a request queue, a tag is assigned when
	 * that request is allocated. Else, this member is not used.
	 */
	struct blk_mq_tags	*sched_tags;
	/** @queued: Number of queued requests. */
	unsigned long		queued;
	/** @run: Number of dispatched requests. */
	unsigned long		run;
#define BLK_MQ_MAX_DISPATCH_ORDER	7
	/** @dispatched: Histogram of dispatched request batches, bucketed by batch size order. */
	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

	/** @numa_node: NUMA node the storage adapter has been connected to. */
	unsigned int		numa_node;
	/** @queue_num: Index of this hardware queue. */
	unsigned int		queue_num;
	/**
	 * @nr_active: Number of active requests. Only used when a tag set is
	 * shared across request queues.
	 */
	atomic_t		nr_active;
	/**
	 * @elevator_queued: Number of queued requests on hctx.
	 */
	atomic_t		elevator_queued;

	/** @cpuhp_online: List to store request if CPU is going to die. */
	struct hlist_node	cpuhp_online;
	/** @cpuhp_dead: List to store request if some CPU dies. */
	struct hlist_node	cpuhp_dead;
	/** @kobj: Kernel object for sysfs. */
	struct kobject		kobj;
	/** @poll_considered: Count times blk_poll() was called. */
	unsigned long		poll_considered;
	/** @poll_invoked: Count how many requests blk_poll() polled. */
	unsigned long		poll_invoked;
	/** @poll_success: Count how many polled requests were completed. */
	unsigned long		poll_success;
#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @debugfs_dir: debugfs directory for this hardware queue. Named
	 * as cpu<cpu_number>.
	 */
	struct dentry		*debugfs_dir;
	/** @sched_debugfs_dir: debugfs directory for the scheduler. */
	struct dentry		*sched_debugfs_dir;
#endif
	/**
	 * @hctx_list: if this hctx is not in use, this is an entry in
	 * q->unused_hctx_list.
	 */
	struct list_head	hctx_list;

	/**
	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
	 * blk_mq_hw_ctx_size().
	 */
	struct srcu_struct	srcu[];
};
/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map:       CPU ID to hardware queue index map. This is an array
 *	with nr_cpu_ids elements. Each element has a value in the range
 *	[@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues:    Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *	driver to map each hardware queue type (enum hctx_type) onto a distinct
 *	set of hardware queues.
 */
struct blk_mq_queue_map {
	unsigned int *mq_map;
	unsigned int nr_queues;
	unsigned int queue_offset;
};
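
/*
 * Example (illustrative sketch, not part of this header): a driver with no
 * special mapping needs can let the block layer spread CPUs over its hardware
 * queues from its ->map_queues() callback.  The mydrv_map_queues() name below
 * is hypothetical.
 *
 *	static int mydrv_map_queues(struct blk_mq_tag_set *set)
 *	{
 *		// Default spreading of nr_cpu_ids CPUs over set->nr_hw_queues.
 *		return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 *	}
 */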
/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:	Just for READ I/O.
 * @HCTX_TYPE_POLL:	Polled I/O of any kind.
 * @HCTX_MAX_TYPES:	Number of types of hctx.
 */
enum hctx_type {
	HCTX_TYPE_DEFAULT,
	HCTX_TYPE_READ,
	HCTX_TYPE_POLL,

	HCTX_MAX_TYPES,
};
/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @map:	   One or more ctx -> hctx mappings. One map exists for each
 *		   hardware queue type (enum hctx_type) that the driver wishes
 *		   to support. There are no restrictions on maps being of the
 *		   same size, and it's perfectly legal to share maps between
 *		   types.
 * @nr_maps:	   Number of elements in the @map array. A number in the range
 *		   [1, HCTX_MAX_TYPES].
 * @ops:	   Pointers to functions that implement block driver behavior.
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *		   owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *		   allocations.
 * @cmd_size:	   Number of additional bytes to allocate per request. The block
 *		   driver owns these additional bytes.
 * @numa_node:	   NUMA node the storage adapter has been connected to.
 * @timeout:	   Request processing timeout in jiffies.
 * @flags:	   Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *		   tag set.
 * @active_queues_shared_sbitmap:
 *		   Number of active request queues per tag set.
 * @__bitmap_tags: A shared tags sbitmap, used over all hctx's
 * @__breserved_tags:
 *		   A shared reserved tags sbitmap, used over all hctx's
 * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *		   elements.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:	   List of the request queues that use this tag set. See also
 *		   request_queue.tag_set_list.
 */
struct blk_mq_tag_set {
	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
	unsigned int		nr_maps;
	const struct blk_mq_ops	*ops;
	unsigned int		nr_hw_queues;
	unsigned int		queue_depth;
	unsigned int		reserved_tags;
	unsigned int		cmd_size;
	int			numa_node;
	unsigned int		timeout;
	unsigned int		flags;
	void			*driver_data;
	atomic_t		active_queues_shared_sbitmap;

	struct sbitmap_queue	__bitmap_tags;
	struct sbitmap_queue	__breserved_tags;
	struct blk_mq_tags	**tags;

	struct mutex		tag_list_lock;
	struct list_head	tag_list;
};
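
/*
 * Example (illustrative sketch, not part of this header): minimal tag set
 * initialisation as a single-queue driver might do it before creating a
 * request queue.  The mydrv_* names, struct mydrv_cmd and the queue depth
 * are hypothetical.
 *
 *	static struct blk_mq_tag_set mydrv_tag_set;
 *
 *	static int mydrv_init_tag_set(void)
 *	{
 *		mydrv_tag_set.ops		= &mydrv_mq_ops;
 *		mydrv_tag_set.nr_hw_queues	= 1;
 *		mydrv_tag_set.queue_depth	= 128;
 *		mydrv_tag_set.numa_node		= NUMA_NO_NODE;
 *		mydrv_tag_set.cmd_size		= sizeof(struct mydrv_cmd);
 *		mydrv_tag_set.flags		= BLK_MQ_F_SHOULD_MERGE;
 *		return blk_mq_alloc_tag_set(&mydrv_tag_set);
 *	}
 */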
/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * @rq:   Request pointer.
 * @last: If it is the last request in the queue.
 */
struct blk_mq_queue_data {
	struct request *rq;
	bool last;
};
typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
		bool);
typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
	/**
	 * @queue_rq: Queue a new request from block IO.
	 */
	blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
				 const struct blk_mq_queue_data *);

	/**
	 * @commit_rqs: If a driver uses bd->last to judge when to submit
	 * requests to hardware, it must define this function. In case of errors
	 * that make us stop issuing further requests, this hook serves the
	 * purpose of kicking the hardware (which the last request otherwise
	 * would have done).
	 */
	void (*commit_rqs)(struct blk_mq_hw_ctx *);

	/**
	 * @get_budget: Reserve budget before queueing a request. Once
	 * .queue_rq is run, it is the driver's responsibility to release the
	 * reserved budget. The failure case of .get_budget must also be
	 * handled to avoid I/O deadlock.
	 */
	bool (*get_budget)(struct request_queue *);

	/**
	 * @put_budget: Release the reserved budget.
	 */
	void (*put_budget)(struct request_queue *);

	/**
	 * @timeout: Called on request timeout.
	 */
	enum blk_eh_timer_return (*timeout)(struct request *, bool);

	/**
	 * @poll: Called to poll for completion of a specific tag.
	 */
	int (*poll)(struct blk_mq_hw_ctx *);

	/**
	 * @complete: Mark the request as complete.
	 */
	void (*complete)(struct request *);

	/**
	 * @init_hctx: Called when the block layer side of a hardware queue has
	 * been set up, allowing the driver to allocate/init matching
	 * structures.
	 */
	int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
	/**
	 * @exit_hctx: Ditto for exit/teardown.
	 */
	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

	/**
	 * @init_request: Called for every command allocated by the block layer
	 * to allow the driver to set up driver specific data.
	 *
	 * Tag greater than or equal to queue_depth is for setting up
	 * flush request.
	 */
	int (*init_request)(struct blk_mq_tag_set *set, struct request *,
			    unsigned int, unsigned int);
	/**
	 * @exit_request: Ditto for exit/teardown.
	 */
	void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
			     unsigned int);

	/**
	 * @initialize_rq_fn: Called from inside blk_get_request().
	 */
	void (*initialize_rq_fn)(struct request *rq);

	/**
	 * @cleanup_rq: Called before freeing one request which isn't completed
	 * yet, and usually for freeing the driver private data.
	 */
	void (*cleanup_rq)(struct request *);

	/**
	 * @busy: If set, returns whether or not this queue currently is busy.
	 */
	bool (*busy)(struct request_queue *);

	/**
	 * @map_queues: This allows drivers to specify their own queue mapping
	 * by overriding the setup-time function that builds the mq_map.
	 */
	int (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
	/**
	 * @show_rq: Used by the debugfs implementation to show driver-specific
	 * information about a request.
	 */
	void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};
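
/*
 * Example (illustrative sketch, not part of this header): skeleton of a
 * ->queue_rq() implementation.  The driver marks the request as started,
 * hands it to the hardware and reports the outcome with a BLK_STS_* code.
 * mydrv_issue() is a hypothetical hardware-submission helper.
 *
 *	static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					   const struct blk_mq_queue_data *bd)
 *	{
 *		struct request *rq = bd->rq;
 *
 *		blk_mq_start_request(rq);
 *		if (!mydrv_issue(hctx->driver_data, rq))
 *			return BLK_STS_RESOURCE;	// block layer retries later
 *		return BLK_STS_OK;
 *	}
 */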
enum {
	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
	BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
	/*
	 * Set when this device requires underlying blk-mq device for
	 * completing IO:
	 */
	BLK_MQ_F_STACKING	= 1 << 2,
	BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
	BLK_MQ_F_BLOCKING	= 1 << 5,
	BLK_MQ_F_NO_SCHED	= 1 << 6,
	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
	BLK_MQ_F_ALLOC_POLICY_BITS = 1,

	BLK_MQ_S_STOPPED	= 0,
	BLK_MQ_S_TAG_ACTIVE	= 1,
	BLK_MQ_S_SCHED_RESTART	= 2,

	/* hw queue is inactive after all its CPUs become offline */
	BLK_MQ_S_INACTIVE	= 3,

	BLK_MQ_MAX_DEPTH	= 10240,

	BLK_MQ_CPU_WORK_BATCH	= 8,
};
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \
	((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \
		((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1))
#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \
	((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \
		<< BLK_MQ_F_ALLOC_POLICY_START_BIT)
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
		void *queuedata);
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
						  struct request_queue *q,
						  bool elevator_init);
struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
						const struct blk_mq_ops *ops,
						unsigned int queue_depth,
						unsigned int set_flags);
void blk_mq_unregister_dev(struct device *, struct request_queue *);

int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

void blk_mq_free_request(struct request *rq);

bool blk_mq_queue_inflight(struct request_queue *q);
enum {
	/* return when out of requests */
	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),
	/* allocate from reserved pool */
	BLK_MQ_REQ_RESERVED	= (__force blk_mq_req_flags_t)(1 << 1),
	/* set RQF_PM */
	BLK_MQ_REQ_PM		= (__force blk_mq_req_flags_t)(1 << 2),
};
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
		unsigned int op, blk_mq_req_flags_t flags,
		unsigned int hctx_idx);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
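
/*
 * Example (illustrative sketch, not part of this header): allocating a
 * driver-internal request without sleeping.  On success the caller owns the
 * request and must eventually free it with blk_mq_free_request().
 *
 *	struct request *rq;
 *
 *	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	// use the request, then
 *	blk_mq_free_request(rq);
 */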
enum {
	BLK_MQ_UNIQUE_TAG_BITS = 16,
	BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};
u32 blk_mq_unique_tag(struct request *rq);

static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}
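
/*
 * Example (illustrative sketch, not part of this header): a driver that
 * completes requests from a single interrupt handler can use the unique tag
 * to recover both the hardware queue index and the per-queue tag:
 *
 *	u32 unique = blk_mq_unique_tag(rq);
 *	u16 hwq = blk_mq_unique_tag_to_hwq(unique);
 *	u16 tag = blk_mq_unique_tag_to_tag(unique);
 */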
/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
	return READ_ONCE(rq->state);
}

static inline int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}

static inline int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}
void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
		busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout);

int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

bool __blk_should_fake_timeout(struct request_queue *q);
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
	if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) &&
	    test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
		return __blk_should_fake_timeout(q);
	return false;
}
/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be cast
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
	return pdu - sizeof(struct request);
}
/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request. So add request to get
 * the PDU.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
	return rq + 1;
}
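
/*
 * Example (illustrative sketch, not part of this header): a driver that set
 * @cmd_size to sizeof(struct mydrv_cmd) in its tag set (struct mydrv_cmd is
 * hypothetical) can convert between the request and its private command in
 * both directions:
 *
 *	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *	struct request *rq2 = blk_mq_rq_from_pdu(cmd);	// rq2 == rq
 */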
#define queue_for_each_hw_ctx(q, hctx, i)				\
	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)

#define hctx_for_each_ctx(hctx, ctx, i)					\
	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
	     ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)
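
/*
 * Example (illustrative sketch, not part of this header): walking every
 * hardware queue of a request queue, here summing the software queues mapped
 * to each hctx:
 *
 *	struct blk_mq_hw_ctx *hctx;
 *	unsigned int i, nr_ctx = 0;
 *
 *	queue_for_each_hw_ctx(q, hctx, i)
 *		nr_ctx += hctx->nr_ctx;
 */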
static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx,
		struct request *rq)
{
	if (rq->tag != -1)
		return rq->tag | (hctx->queue_num << BLK_QC_T_SHIFT);

	return rq->internal_tag | (hctx->queue_num << BLK_QC_T_SHIFT) |
			BLK_QC_T_INTERNAL;
}
static inline void blk_mq_cleanup_rq(struct request *rq)
{
	if (rq->q->mq_ops->cleanup_rq)
		rq->q->mq_ops->cleanup_rq(rq);
}
static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
		unsigned int nr_segs)
{
	rq->nr_phys_segments = nr_segs;
	rq->__data_len = bio->bi_iter.bi_size;
	rq->bio = rq->biotail = bio;
	rq->ioprio = bio_prio(bio);

	if (bio->bi_disk)
		rq->rq_disk = bio->bi_disk;
}
blk_qc_t blk_mq_submit_bio(struct bio *bio);
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
		struct lock_class_key *key);

#endif