/*
 * Copyright (c) 2017 Christoph Hellwig.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
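
/*
 * Native NVMe multipath: controllers that expose the same namespace are
 * grouped behind a single nvme_ns_head block device, and each bio is
 * steered to a live path at submission time.  With this file built into
 * the nvme-core module, the "multipath" parameter below can be toggled
 * at boot, e.g. via nvme_core.multipath=0 on the kernel command line to
 * fall back to the non-multipath behavior.
 */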

#include <linux/moduleparam.h>
#include "nvme.h"

static bool multipath = true;
module_param(multipath, bool, 0644);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

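/*
 * Called from the completion path when a command failed with a status
 * that nvme_req_needs_failover() classified as a path error: steal the
 * bios off the request onto the ns_head requeue list, complete the
 * original request without error, reset the controller the failure was
 * seen on, and kick the requeue work so the bios are resubmitted through
 * the multipath node and can pick a different path.
 */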
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	unsigned long flags;

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
	blk_mq_end_request(req, 0);

	nvme_reset_ctrl(ns->ctrl);
	kblockd_schedule_work(&ns->head->requeue_work);
}

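/*
 * Decide whether a failed multipath request should be retried on another
 * path.  Status codes that indicate a problem with the command or the
 * medium itself would fail the same way on every path, so they are
 * passed back to the caller; anything else is treated as a potential
 * path failure.
 */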
bool nvme_req_needs_failover(struct request *req)
{
	if (!(req->cmd_flags & REQ_NVME_MPATH))
		return false;

	switch (nvme_req(req)->status & 0x7ff) {
	/*
	 * Generic command status:
	 */
	case NVME_SC_INVALID_OPCODE:
	case NVME_SC_INVALID_FIELD:
	case NVME_SC_INVALID_NS:
	case NVME_SC_LBA_RANGE:
	case NVME_SC_CAP_EXCEEDED:
	case NVME_SC_RESERVATION_CONFLICT:
		return false;

	/*
	 * I/O command set specific error.  Unfortunately these values are
	 * reused for fabrics commands, but those should never get here.
	 */
	case NVME_SC_BAD_ATTRIBUTES:
	case NVME_SC_INVALID_PI:
	case NVME_SC_READ_ONLY:
	case NVME_SC_ONCS_NOT_SUPPORTED:
		WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode ==
			nvme_fabrics_command);
		return false;

	/*
	 * Media and Data Integrity Errors:
	 */
	case NVME_SC_WRITE_FAULT:
	case NVME_SC_READ_ERROR:
	case NVME_SC_GUARD_CHECK:
	case NVME_SC_APPTAG_CHECK:
	case NVME_SC_REFTAG_CHECK:
	case NVME_SC_COMPARE_FAILED:
	case NVME_SC_ACCESS_DENIED:
	case NVME_SC_UNWRITTEN_BLOCK:
		return false;
	}

	/* Everything else could be a path failure, so should be retried */
	return true;
}

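/*
 * Kick the requeue work for every multipath node on this controller,
 * e.g. after the controller came (back) up, so that bios parked while
 * no path was available get another chance to be dispatched.
 */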
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	mutex_lock(&ctrl->namespaces_mutex);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->disk)
			kblockd_schedule_work(&ns->head->requeue_work);
	}
	mutex_unlock(&ctrl->namespaces_mutex);
}

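/*
 * Slow path: scan all sibling namespaces and cache the first one whose
 * controller is live as the current path.  Called with the head->srcu
 * read lock held.
 */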
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (ns->ctrl->state == NVME_CTRL_LIVE) {
			rcu_assign_pointer(head->current_path, ns);
			return ns;
		}
	}

	return NULL;
}

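/*
 * Fast path: return the cached current path if its controller is still
 * live, and fall back to a full scan otherwise.  Callers must hold the
 * head->srcu read lock across any use of the returned namespace.
 */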
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);

	if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
		ns = __nvme_find_path(head);
	return ns;
}

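/*
 * Bio-based make_request function for the multipath node.  Three
 * outcomes: if a live path exists, the bio is tagged with REQ_NVME_MPATH
 * and sent straight to the per-path queue; if paths are registered but
 * none is usable right now, the bio is parked on the requeue list; if no
 * path is left at all, the bio is failed with an I/O error.
 */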
static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
		struct bio *bio)
{
	struct nvme_ns_head *head = q->queuedata;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	blk_qc_t ret = BLK_QC_T_NONE;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio->bi_disk = ns->disk;
		bio->bi_opf |= REQ_NVME_MPATH;
		ret = direct_make_request(bio);
	} else if (!list_empty_careful(&head->list)) {
		dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no path - failing I/O\n");

		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

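/*
 * Polling support for the multipath node: forward the poll to the queue
 * of the currently cached path, provided that path is still live.
 */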
static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
{
	struct nvme_ns_head *head = q->queuedata;
	struct nvme_ns *ns;
	bool found = false;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = srcu_dereference(head->current_path, &head->srcu);
	if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
		found = ns->queue->poll_fn(q, qc);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return found;
}

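/*
 * Work item that drains the requeue list: grab all parked bios under the
 * lock, then resubmit each one through the multipath node so that path
 * selection runs again.
 */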
static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		/*
		 * Reset disk to the mpath node and resubmit to select a new
		 * path.
		 */
		bio->bi_disk = head->disk;
		generic_make_request(bio);
	}
}

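/*
 * Set up the requeue machinery for this ns_head and, if the subsystem
 * can be reached through multiple controllers (CMIC bit 1) and the
 * multipath parameter is enabled, allocate the bio-based multipath queue
 * and gendisk.  For single-controller subsystems this is a no-op and
 * I/O only goes through the per-controller device.
 */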
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct request_queue *q;
	bool vwc = false;

	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing data
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
		return 0;

	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
	if (!q)
		goto out;
	q->queuedata = head;
	blk_queue_make_request(q, nvme_ns_head_make_request);
	q->poll_fn = nvme_ns_head_poll;
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(q, 512);

	/* we need to propagate up the VWC settings */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(q, vwc, vwc);

	head->disk = alloc_disk(0);
	if (!head->disk)
		goto out_cleanup_queue;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	head->disk->queue = q;
	head->disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	return 0;

out_cleanup_queue:
	blk_cleanup_queue(q);
out:
	return -ENOMEM;
}

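/*
 * Register the multipath gendisk with the subsystem device as its parent
 * and expose the namespace identification attributes on it.
 */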
void nvme_mpath_add_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	device_add_disk(&head->subsys->dev, head->disk);
	if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
			&nvme_ns_id_attr_group))
		pr_warn("%s: failed to create sysfs group for identification\n",
			head->disk->disk_name);
}

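/*
 * Wire up the slaves/holders sysfs links between a per-path disk and the
 * multipath node, as stacking drivers like device mapper do.  The
 * resulting layout is, illustratively:
 *
 *   /sys/block/<mpath disk>/slaves/<path disk>
 *   /sys/block/<path disk>/holders/<mpath disk>
 *
 * If the holders link cannot be created, the slaves link is removed
 * again so the two stay consistent.
 */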
void nvme_mpath_add_disk_links(struct nvme_ns *ns)
{
	struct kobject *slave_disk_kobj, *holder_disk_kobj;

	if (!ns->head->disk)
		return;

	slave_disk_kobj = &disk_to_dev(ns->disk)->kobj;
	if (sysfs_create_link(ns->head->disk->slave_dir, slave_disk_kobj,
			kobject_name(slave_disk_kobj)))
		return;

	holder_disk_kobj = &disk_to_dev(ns->head->disk)->kobj;
	if (sysfs_create_link(ns->disk->part0.holder_dir, holder_disk_kobj,
			kobject_name(holder_disk_kobj)))
		sysfs_remove_link(ns->head->disk->slave_dir,
				kobject_name(slave_disk_kobj));
}

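/*
 * Tear down the multipath node: remove the identification attributes and
 * the gendisk, mark the queue dying so no new bios are accepted, flush
 * one last run of the requeue work so parked bios are failed rather than
 * left behind, and drop the queue and disk references.
 */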
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
			   &nvme_ns_id_attr_group);
	del_gendisk(head->disk);
	blk_set_queue_dying(head->disk->queue);
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	blk_cleanup_queue(head->disk->queue);
	put_disk(head->disk);
}

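/*
 * Counterpart to nvme_mpath_add_disk_links(): drop the holders and
 * slaves links for this path again.
 */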
void nvme_mpath_remove_disk_links(struct nvme_ns *ns)
{
	if (!ns->head->disk)
		return;

	sysfs_remove_link(ns->disk->part0.holder_dir,
			  kobject_name(&disk_to_dev(ns->head->disk)->kobj));
	sysfs_remove_link(ns->head->disk->slave_dir,
			  kobject_name(&disk_to_dev(ns->disk)->kobj));
}