/*
 * Copyright (c) 2017 Christoph Hellwig.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
14 #include <linux/moduleparam.h>
17 static bool multipath
= true;
18 module_param(multipath
, bool, 0644);
19 MODULE_PARM_DESC(multipath
,
20 "turn on native support for multiple controllers per subsystem");
22 void nvme_failover_req(struct request
*req
)
24 struct nvme_ns
*ns
= req
->q
->queuedata
;
27 spin_lock_irqsave(&ns
->head
->requeue_lock
, flags
);
28 blk_steal_bios(&ns
->head
->requeue_list
, req
);
29 spin_unlock_irqrestore(&ns
->head
->requeue_lock
, flags
);
30 blk_mq_end_request(req
, 0);
32 nvme_reset_ctrl(ns
->ctrl
);
33 kblockd_schedule_work(&ns
->head
->requeue_work
);
36 bool nvme_req_needs_failover(struct request
*req
)
38 if (!(req
->cmd_flags
& REQ_NVME_MPATH
))
41 switch (nvme_req(req
)->status
& 0x7ff) {
43 * Generic command status:
45 case NVME_SC_INVALID_OPCODE
:
46 case NVME_SC_INVALID_FIELD
:
47 case NVME_SC_INVALID_NS
:
48 case NVME_SC_LBA_RANGE
:
49 case NVME_SC_CAP_EXCEEDED
:
50 case NVME_SC_RESERVATION_CONFLICT
:
54 * I/O command set specific error. Unfortunately these values are
55 * reused for fabrics commands, but those should never get here.
57 case NVME_SC_BAD_ATTRIBUTES
:
58 case NVME_SC_INVALID_PI
:
59 case NVME_SC_READ_ONLY
:
60 case NVME_SC_ONCS_NOT_SUPPORTED
:
61 WARN_ON_ONCE(nvme_req(req
)->cmd
->common
.opcode
==
62 nvme_fabrics_command
);
66 * Media and Data Integrity Errors:
68 case NVME_SC_WRITE_FAULT
:
69 case NVME_SC_READ_ERROR
:
70 case NVME_SC_GUARD_CHECK
:
71 case NVME_SC_APPTAG_CHECK
:
72 case NVME_SC_REFTAG_CHECK
:
73 case NVME_SC_COMPARE_FAILED
:
74 case NVME_SC_ACCESS_DENIED
:
75 case NVME_SC_UNWRITTEN_BLOCK
:
79 /* Everything else could be a path failure, so should be retried */
83 void nvme_kick_requeue_lists(struct nvme_ctrl
*ctrl
)
87 mutex_lock(&ctrl
->namespaces_mutex
);
88 list_for_each_entry(ns
, &ctrl
->namespaces
, list
) {
90 kblockd_schedule_work(&ns
->head
->requeue_work
);
92 mutex_unlock(&ctrl
->namespaces_mutex
);
95 static struct nvme_ns
*__nvme_find_path(struct nvme_ns_head
*head
)
99 list_for_each_entry_rcu(ns
, &head
->list
, siblings
) {
100 if (ns
->ctrl
->state
== NVME_CTRL_LIVE
) {
101 rcu_assign_pointer(head
->current_path
, ns
);
109 inline struct nvme_ns
*nvme_find_path(struct nvme_ns_head
*head
)
111 struct nvme_ns
*ns
= srcu_dereference(head
->current_path
, &head
->srcu
);
113 if (unlikely(!ns
|| ns
->ctrl
->state
!= NVME_CTRL_LIVE
))
114 ns
= __nvme_find_path(head
);
118 static blk_qc_t
nvme_ns_head_make_request(struct request_queue
*q
,
121 struct nvme_ns_head
*head
= q
->queuedata
;
122 struct device
*dev
= disk_to_dev(head
->disk
);
124 blk_qc_t ret
= BLK_QC_T_NONE
;
127 srcu_idx
= srcu_read_lock(&head
->srcu
);
128 ns
= nvme_find_path(head
);
130 bio
->bi_disk
= ns
->disk
;
131 bio
->bi_opf
|= REQ_NVME_MPATH
;
132 ret
= direct_make_request(bio
);
133 } else if (!list_empty_careful(&head
->list
)) {
134 dev_warn_ratelimited(dev
, "no path available - requeuing I/O\n");
136 spin_lock_irq(&head
->requeue_lock
);
137 bio_list_add(&head
->requeue_list
, bio
);
138 spin_unlock_irq(&head
->requeue_lock
);
140 dev_warn_ratelimited(dev
, "no path - failing I/O\n");
142 bio
->bi_status
= BLK_STS_IOERR
;
146 srcu_read_unlock(&head
->srcu
, srcu_idx
);
150 static bool nvme_ns_head_poll(struct request_queue
*q
, blk_qc_t qc
)
152 struct nvme_ns_head
*head
= q
->queuedata
;
157 srcu_idx
= srcu_read_lock(&head
->srcu
);
158 ns
= srcu_dereference(head
->current_path
, &head
->srcu
);
159 if (likely(ns
&& ns
->ctrl
->state
== NVME_CTRL_LIVE
))
160 found
= ns
->queue
->poll_fn(q
, qc
);
161 srcu_read_unlock(&head
->srcu
, srcu_idx
);
165 static void nvme_requeue_work(struct work_struct
*work
)
167 struct nvme_ns_head
*head
=
168 container_of(work
, struct nvme_ns_head
, requeue_work
);
169 struct bio
*bio
, *next
;
171 spin_lock_irq(&head
->requeue_lock
);
172 next
= bio_list_get(&head
->requeue_list
);
173 spin_unlock_irq(&head
->requeue_lock
);
175 while ((bio
= next
) != NULL
) {
180 * Reset disk to the mpath node and resubmit to select a new
183 bio
->bi_disk
= head
->disk
;
184 generic_make_request(bio
);
188 int nvme_mpath_alloc_disk(struct nvme_ctrl
*ctrl
, struct nvme_ns_head
*head
)
190 struct request_queue
*q
;
193 bio_list_init(&head
->requeue_list
);
194 spin_lock_init(&head
->requeue_lock
);
195 INIT_WORK(&head
->requeue_work
, nvme_requeue_work
);
198 * Add a multipath node if the subsystems supports multiple controllers.
199 * We also do this for private namespaces as the namespace sharing data could
200 * change after a rescan.
202 if (!(ctrl
->subsys
->cmic
& (1 << 1)) || !multipath
)
205 q
= blk_alloc_queue_node(GFP_KERNEL
, NUMA_NO_NODE
);
209 blk_queue_make_request(q
, nvme_ns_head_make_request
);
210 q
->poll_fn
= nvme_ns_head_poll
;
211 queue_flag_set_unlocked(QUEUE_FLAG_NONROT
, q
);
212 /* set to a default value for 512 until disk is validated */
213 blk_queue_logical_block_size(q
, 512);
215 /* we need to propagate up the VMC settings */
216 if (ctrl
->vwc
& NVME_CTRL_VWC_PRESENT
)
218 blk_queue_write_cache(q
, vwc
, vwc
);
220 head
->disk
= alloc_disk(0);
222 goto out_cleanup_queue
;
223 head
->disk
->fops
= &nvme_ns_head_ops
;
224 head
->disk
->private_data
= head
;
225 head
->disk
->queue
= q
;
226 head
->disk
->flags
= GENHD_FL_EXT_DEVT
;
227 sprintf(head
->disk
->disk_name
, "nvme%dn%d",
228 ctrl
->subsys
->instance
, head
->instance
);
232 blk_cleanup_queue(q
);
237 void nvme_mpath_add_disk(struct nvme_ns_head
*head
)
241 device_add_disk(&head
->subsys
->dev
, head
->disk
);
242 if (sysfs_create_group(&disk_to_dev(head
->disk
)->kobj
,
243 &nvme_ns_id_attr_group
))
244 pr_warn("%s: failed to create sysfs group for identification\n",
245 head
->disk
->disk_name
);
248 void nvme_mpath_add_disk_links(struct nvme_ns
*ns
)
250 struct kobject
*slave_disk_kobj
, *holder_disk_kobj
;
255 slave_disk_kobj
= &disk_to_dev(ns
->disk
)->kobj
;
256 if (sysfs_create_link(ns
->head
->disk
->slave_dir
, slave_disk_kobj
,
257 kobject_name(slave_disk_kobj
)))
260 holder_disk_kobj
= &disk_to_dev(ns
->head
->disk
)->kobj
;
261 if (sysfs_create_link(ns
->disk
->part0
.holder_dir
, holder_disk_kobj
,
262 kobject_name(holder_disk_kobj
)))
263 sysfs_remove_link(ns
->head
->disk
->slave_dir
,
264 kobject_name(slave_disk_kobj
));
267 void nvme_mpath_remove_disk(struct nvme_ns_head
*head
)
271 sysfs_remove_group(&disk_to_dev(head
->disk
)->kobj
,
272 &nvme_ns_id_attr_group
);
273 del_gendisk(head
->disk
);
274 blk_set_queue_dying(head
->disk
->queue
);
275 /* make sure all pending bios are cleaned up */
276 kblockd_schedule_work(&head
->requeue_work
);
277 flush_work(&head
->requeue_work
);
278 blk_cleanup_queue(head
->disk
->queue
);
279 put_disk(head
->disk
);
282 void nvme_mpath_remove_disk_links(struct nvme_ns
*ns
)
287 sysfs_remove_link(ns
->disk
->part0
.holder_dir
,
288 kobject_name(&disk_to_dev(ns
->head
->disk
)->kobj
));
289 sysfs_remove_link(ns
->head
->disk
->slave_dir
,
290 kobject_name(&disk_to_dev(ns
->disk
)->kobj
));