]>
Commit | Line | Data |
---|---|---|
bc50ad75 | 1 | // SPDX-License-Identifier: GPL-2.0 |
32acab31 | 2 | /* |
0d0b660f | 3 | * Copyright (c) 2017-2018 Christoph Hellwig. |
32acab31 CH |
4 | */ |
5 | ||
b2ce4d90 | 6 | #include <linux/backing-dev.h> |
32acab31 | 7 | #include <linux/moduleparam.h> |
2796b569 | 8 | #include <trace/events/block.h> |
32acab31 CH |
9 | #include "nvme.h" |
10 | ||
11 | static bool multipath = true; | |
5cadde80 | 12 | module_param(multipath, bool, 0444); |
32acab31 CH |
13 | MODULE_PARM_DESC(multipath, |
14 | "turn on native support for multiple controllers per subsystem"); | |
15 | ||
b9156dae SG |
16 | void nvme_mpath_unfreeze(struct nvme_subsystem *subsys) |
17 | { | |
18 | struct nvme_ns_head *h; | |
19 | ||
20 | lockdep_assert_held(&subsys->lock); | |
21 | list_for_each_entry(h, &subsys->nsheads, entry) | |
22 | if (h->disk) | |
23 | blk_mq_unfreeze_queue(h->disk->queue); | |
24 | } | |
25 | ||
26 | void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) | |
27 | { | |
28 | struct nvme_ns_head *h; | |
29 | ||
30 | lockdep_assert_held(&subsys->lock); | |
31 | list_for_each_entry(h, &subsys->nsheads, entry) | |
32 | if (h->disk) | |
33 | blk_mq_freeze_queue_wait(h->disk->queue); | |
34 | } | |
35 | ||
36 | void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) | |
37 | { | |
38 | struct nvme_ns_head *h; | |
39 | ||
40 | lockdep_assert_held(&subsys->lock); | |
41 | list_for_each_entry(h, &subsys->nsheads, entry) | |
42 | if (h->disk) | |
43 | blk_freeze_queue_start(h->disk->queue); | |
44 | } | |
45 | ||
a785dbcc KB |
46 | /* |
47 | * If multipathing is enabled we need to always use the subsystem instance | |
48 | * number for numbering our devices to avoid conflicts between subsystems that | |
49 | * have multiple controllers and thus use the multipath-aware subsystem node | |
50 | * and those that have a single controller and use the controller node | |
51 | * directly. | |
52 | */ | |
9953ab0c | 53 | bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags) |
a785dbcc | 54 | { |
9953ab0c CH |
55 | if (!multipath) |
56 | return false; | |
57 | if (!ns->head->disk) { | |
58 | sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance, | |
59 | ns->head->instance); | |
60 | return true; | |
a785dbcc | 61 | } |
9953ab0c CH |
62 | sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance, |
63 | ns->ctrl->instance, ns->head->instance); | |
64 | *flags = GENHD_FL_HIDDEN; | |
65 | return true; | |
a785dbcc KB |
66 | } |
67 | ||
5ddaabe8 | 68 | void nvme_failover_req(struct request *req) |
32acab31 CH |
69 | { |
70 | struct nvme_ns *ns = req->q->queuedata; | |
5ddaabe8 | 71 | u16 status = nvme_req(req)->status & 0x7ff; |
32acab31 | 72 | unsigned long flags; |
ce86dad2 | 73 | struct bio *bio; |
32acab31 | 74 | |
5ddaabe8 CH |
75 | nvme_mpath_clear_current_path(ns); |
76 | ||
77 | /* | |
78 | * If we got back an ANA error, we know the controller is alive but not | |
79 | * ready to serve this namespace. Kick of a re-read of the ANA | |
80 | * information page, and just try any other available path for now. | |
81 | */ | |
82 | if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) { | |
83 | set_bit(NVME_NS_ANA_PENDING, &ns->flags); | |
84 | queue_work(nvme_wq, &ns->ctrl->ana_work); | |
0d0b660f CH |
85 | } |
86 | ||
764e9332 | 87 | spin_lock_irqsave(&ns->head->requeue_lock, flags); |
ce86dad2 DW |
88 | for (bio = req->bio; bio; bio = bio->bi_next) |
89 | bio_set_dev(bio, ns->head->disk->part0); | |
764e9332 JM |
90 | blk_steal_bios(&ns->head->requeue_list, req); |
91 | spin_unlock_irqrestore(&ns->head->requeue_lock, flags); | |
764e9332 | 92 | |
5ddaabe8 | 93 | blk_mq_end_request(req, 0); |
32acab31 CH |
94 | kblockd_schedule_work(&ns->head->requeue_work); |
95 | } | |
96 | ||
32acab31 CH |
97 | void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) |
98 | { | |
99 | struct nvme_ns *ns; | |
100 | ||
765cc031 | 101 | down_read(&ctrl->namespaces_rwsem); |
32acab31 CH |
102 | list_for_each_entry(ns, &ctrl->namespaces, list) { |
103 | if (ns->head->disk) | |
104 | kblockd_schedule_work(&ns->head->requeue_work); | |
105 | } | |
765cc031 | 106 | up_read(&ctrl->namespaces_rwsem); |
32acab31 CH |
107 | } |
108 | ||
0d0b660f CH |
109 | static const char *nvme_ana_state_names[] = { |
110 | [0] = "invalid state", | |
111 | [NVME_ANA_OPTIMIZED] = "optimized", | |
112 | [NVME_ANA_NONOPTIMIZED] = "non-optimized", | |
113 | [NVME_ANA_INACCESSIBLE] = "inaccessible", | |
114 | [NVME_ANA_PERSISTENT_LOSS] = "persistent-loss", | |
115 | [NVME_ANA_CHANGE] = "change", | |
116 | }; | |
117 | ||
0157ec8d | 118 | bool nvme_mpath_clear_current_path(struct nvme_ns *ns) |
32acab31 | 119 | { |
f3334447 | 120 | struct nvme_ns_head *head = ns->head; |
0157ec8d | 121 | bool changed = false; |
f3334447 CH |
122 | int node; |
123 | ||
124 | if (!head) | |
0157ec8d | 125 | goto out; |
f3334447 CH |
126 | |
127 | for_each_node(node) { | |
0157ec8d | 128 | if (ns == rcu_access_pointer(head->current_path[node])) { |
f3334447 | 129 | rcu_assign_pointer(head->current_path[node], NULL); |
0157ec8d SG |
130 | changed = true; |
131 | } | |
f3334447 | 132 | } |
0157ec8d SG |
133 | out: |
134 | return changed; | |
135 | } | |
136 | ||
137 | void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) | |
138 | { | |
139 | struct nvme_ns *ns; | |
140 | ||
763303a8 | 141 | down_read(&ctrl->namespaces_rwsem); |
e6dda568 HR |
142 | list_for_each_entry(ns, &ctrl->namespaces, list) { |
143 | nvme_mpath_clear_current_path(ns); | |
144 | kblockd_schedule_work(&ns->head->requeue_work); | |
145 | } | |
763303a8 | 146 | up_read(&ctrl->namespaces_rwsem); |
f3334447 CH |
147 | } |
148 | ||
e7d65803 HR |
149 | void nvme_mpath_revalidate_paths(struct nvme_ns *ns) |
150 | { | |
151 | struct nvme_ns_head *head = ns->head; | |
152 | sector_t capacity = get_capacity(head->disk); | |
153 | int node; | |
154 | ||
155 | list_for_each_entry_rcu(ns, &head->list, siblings) { | |
156 | if (capacity != get_capacity(ns->disk)) | |
157 | clear_bit(NVME_NS_READY, &ns->flags); | |
158 | } | |
159 | ||
160 | for_each_node(node) | |
161 | rcu_assign_pointer(head->current_path[node], NULL); | |
162 | } | |
163 | ||
ca7ae5c9 HR |
164 | static bool nvme_path_is_disabled(struct nvme_ns *ns) |
165 | { | |
ecca390e SG |
166 | /* |
167 | * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should | |
168 | * still be able to complete assuming that the controller is connected. | |
169 | * Otherwise it will fail immediately and return to the requeue list. | |
170 | */ | |
171 | if (ns->ctrl->state != NVME_CTRL_LIVE && | |
172 | ns->ctrl->state != NVME_CTRL_DELETING) | |
173 | return true; | |
174 | if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || | |
e7d65803 | 175 | !test_bit(NVME_NS_READY, &ns->flags)) |
ecca390e SG |
176 | return true; |
177 | return false; | |
ca7ae5c9 HR |
178 | } |
179 | ||
f3334447 CH |
180 | static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) |
181 | { | |
182 | int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; | |
183 | struct nvme_ns *found = NULL, *fallback = NULL, *ns; | |
32acab31 CH |
184 | |
185 | list_for_each_entry_rcu(ns, &head->list, siblings) { | |
ca7ae5c9 | 186 | if (nvme_path_is_disabled(ns)) |
0d0b660f | 187 | continue; |
f3334447 | 188 | |
75c10e73 HR |
189 | if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) |
190 | distance = node_distance(node, ns->ctrl->numa_node); | |
191 | else | |
192 | distance = LOCAL_DISTANCE; | |
f3334447 | 193 | |
0d0b660f CH |
194 | switch (ns->ana_state) { |
195 | case NVME_ANA_OPTIMIZED: | |
f3334447 CH |
196 | if (distance < found_distance) { |
197 | found_distance = distance; | |
198 | found = ns; | |
199 | } | |
200 | break; | |
0d0b660f | 201 | case NVME_ANA_NONOPTIMIZED: |
f3334447 CH |
202 | if (distance < fallback_distance) { |
203 | fallback_distance = distance; | |
204 | fallback = ns; | |
205 | } | |
0d0b660f CH |
206 | break; |
207 | default: | |
208 | break; | |
32acab31 CH |
209 | } |
210 | } | |
211 | ||
f3334447 CH |
212 | if (!found) |
213 | found = fallback; | |
214 | if (found) | |
215 | rcu_assign_pointer(head->current_path[node], found); | |
216 | return found; | |
0d0b660f CH |
217 | } |
218 | ||
75c10e73 HR |
219 | static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head, |
220 | struct nvme_ns *ns) | |
221 | { | |
222 | ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns, | |
223 | siblings); | |
224 | if (ns) | |
225 | return ns; | |
226 | return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings); | |
227 | } | |
228 | ||
229 | static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, | |
230 | int node, struct nvme_ns *old) | |
231 | { | |
e398863b | 232 | struct nvme_ns *ns, *found = NULL; |
75c10e73 | 233 | |
2032d074 HR |
234 | if (list_is_singular(&head->list)) { |
235 | if (nvme_path_is_disabled(old)) | |
236 | return NULL; | |
75c10e73 | 237 | return old; |
2032d074 | 238 | } |
75c10e73 HR |
239 | |
240 | for (ns = nvme_next_ns(head, old); | |
d1bcf006 | 241 | ns && ns != old; |
75c10e73 | 242 | ns = nvme_next_ns(head, ns)) { |
ca7ae5c9 | 243 | if (nvme_path_is_disabled(ns)) |
75c10e73 HR |
244 | continue; |
245 | ||
246 | if (ns->ana_state == NVME_ANA_OPTIMIZED) { | |
247 | found = ns; | |
248 | goto out; | |
249 | } | |
250 | if (ns->ana_state == NVME_ANA_NONOPTIMIZED) | |
e398863b | 251 | found = ns; |
75c10e73 HR |
252 | } |
253 | ||
93eb0381 MW |
254 | /* |
255 | * The loop above skips the current path for round-robin semantics. | |
256 | * Fall back to the current path if either: | |
257 | * - no other optimized path found and current is optimized, | |
258 | * - no other usable path found and current is usable. | |
259 | */ | |
3f6e3246 | 260 | if (!nvme_path_is_disabled(old) && |
93eb0381 | 261 | (old->ana_state == NVME_ANA_OPTIMIZED || |
e398863b | 262 | (!found && old->ana_state == NVME_ANA_NONOPTIMIZED))) |
93eb0381 MW |
263 | return old; |
264 | ||
e398863b | 265 | if (!found) |
75c10e73 | 266 | return NULL; |
75c10e73 HR |
267 | out: |
268 | rcu_assign_pointer(head->current_path[node], found); | |
269 | return found; | |
270 | } | |
271 | ||
0d0b660f CH |
272 | static inline bool nvme_path_is_optimized(struct nvme_ns *ns) |
273 | { | |
274 | return ns->ctrl->state == NVME_CTRL_LIVE && | |
275 | ns->ana_state == NVME_ANA_OPTIMIZED; | |
32acab31 CH |
276 | } |
277 | ||
278 | inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) | |
279 | { | |
f3334447 CH |
280 | int node = numa_node_id(); |
281 | struct nvme_ns *ns; | |
32acab31 | 282 | |
f3334447 | 283 | ns = srcu_dereference(head->current_path[node], &head->srcu); |
fbd6a42d HR |
284 | if (unlikely(!ns)) |
285 | return __nvme_find_path(head, node); | |
286 | ||
287 | if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR) | |
288 | return nvme_round_robin_path(head, node, ns); | |
289 | if (unlikely(!nvme_path_is_optimized(ns))) | |
290 | return __nvme_find_path(head, node); | |
32acab31 CH |
291 | return ns; |
292 | } | |
293 | ||
0157ec8d SG |
294 | static bool nvme_available_path(struct nvme_ns_head *head) |
295 | { | |
296 | struct nvme_ns *ns; | |
297 | ||
298 | list_for_each_entry_rcu(ns, &head->list, siblings) { | |
8c4dfea9 VG |
299 | if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) |
300 | continue; | |
0157ec8d SG |
301 | switch (ns->ctrl->state) { |
302 | case NVME_CTRL_LIVE: | |
303 | case NVME_CTRL_RESETTING: | |
304 | case NVME_CTRL_CONNECTING: | |
305 | /* fallthru */ | |
306 | return true; | |
307 | default: | |
308 | break; | |
309 | } | |
310 | } | |
311 | return false; | |
312 | } | |
313 | ||
1496bd49 | 314 | static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) |
32acab31 | 315 | { |
309dca30 | 316 | struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; |
32acab31 CH |
317 | struct device *dev = disk_to_dev(head->disk); |
318 | struct nvme_ns *ns; | |
319 | blk_qc_t ret = BLK_QC_T_NONE; | |
320 | int srcu_idx; | |
321 | ||
525aa5a7 | 322 | /* |
f695ca38 CH |
323 | * The namespace might be going away and the bio might be moved to a |
324 | * different queue via blk_steal_bios(), so we need to use the bio_split | |
325 | * pool from the original queue to allocate the bvecs from. | |
525aa5a7 | 326 | */ |
f695ca38 | 327 | blk_queue_split(&bio); |
525aa5a7 | 328 | |
32acab31 CH |
329 | srcu_idx = srcu_read_lock(&head->srcu); |
330 | ns = nvme_find_path(head); | |
331 | if (likely(ns)) { | |
a7c7f7b2 | 332 | bio_set_dev(bio, ns->disk->part0); |
32acab31 | 333 | bio->bi_opf |= REQ_NVME_MPATH; |
1c02fca6 | 334 | trace_block_bio_remap(bio, disk_devt(ns->head->disk), |
2796b569 | 335 | bio->bi_iter.bi_sector); |
5a6c35f9 | 336 | ret = submit_bio_noacct(bio); |
0157ec8d SG |
337 | } else if (nvme_available_path(head)) { |
338 | dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n"); | |
32acab31 CH |
339 | |
340 | spin_lock_irq(&head->requeue_lock); | |
341 | bio_list_add(&head->requeue_list, bio); | |
342 | spin_unlock_irq(&head->requeue_lock); | |
343 | } else { | |
0157ec8d | 344 | dev_warn_ratelimited(dev, "no available path - failing I/O\n"); |
32acab31 CH |
345 | |
346 | bio->bi_status = BLK_STS_IOERR; | |
347 | bio_endio(bio); | |
348 | } | |
349 | ||
350 | srcu_read_unlock(&head->srcu, srcu_idx); | |
351 | return ret; | |
352 | } | |
353 | ||
1496bd49 CH |
354 | static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) |
355 | { | |
356 | if (!nvme_tryget_ns_head(bdev->bd_disk->private_data)) | |
357 | return -ENXIO; | |
358 | return 0; | |
359 | } | |
360 | ||
361 | static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) | |
362 | { | |
363 | nvme_put_ns_head(disk->private_data); | |
364 | } | |
365 | ||
8b4fb0f9 CH |
#ifdef CONFIG_BLK_DEV_ZONED
/*
 * report_zones handler of the multipath disk: forward the request to the
 * currently selected path under the head's SRCU read lock.
 * Returns -EWOULDBLOCK when no path is available, otherwise whatever
 * nvme_ns_report_zones() returns.
 */
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *path;
	int srcu_idx;
	int ret;

	srcu_idx = srcu_read_lock(&head->srcu);
	path = nvme_find_path(head);
	if (path)
		ret = nvme_ns_report_zones(path, sector, nr_zones, cb, data);
	else
		ret = -EWOULDBLOCK;
	srcu_read_unlock(&head->srcu, srcu_idx);

	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */
384 | ||
1496bd49 CH |
385 | const struct block_device_operations nvme_ns_head_ops = { |
386 | .owner = THIS_MODULE, | |
387 | .submit_bio = nvme_ns_head_submit_bio, | |
388 | .open = nvme_ns_head_open, | |
389 | .release = nvme_ns_head_release, | |
390 | .ioctl = nvme_ns_head_ioctl, | |
391 | .getgeo = nvme_getgeo, | |
8b4fb0f9 | 392 | .report_zones = nvme_ns_head_report_zones, |
1496bd49 CH |
393 | .pr_ops = &nvme_pr_ops, |
394 | }; | |
395 | ||
2637baed MI |
396 | static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev) |
397 | { | |
398 | return container_of(cdev, struct nvme_ns_head, cdev); | |
399 | } | |
400 | ||
401 | static int nvme_ns_head_chr_open(struct inode *inode, struct file *file) | |
402 | { | |
403 | if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev))) | |
404 | return -ENXIO; | |
405 | return 0; | |
406 | } | |
407 | ||
408 | static int nvme_ns_head_chr_release(struct inode *inode, struct file *file) | |
409 | { | |
410 | nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev)); | |
411 | return 0; | |
412 | } | |
413 | ||
414 | static const struct file_operations nvme_ns_head_chr_fops = { | |
415 | .owner = THIS_MODULE, | |
416 | .open = nvme_ns_head_chr_open, | |
417 | .release = nvme_ns_head_chr_release, | |
418 | .unlocked_ioctl = nvme_ns_head_chr_ioctl, | |
419 | .compat_ioctl = compat_ptr_ioctl, | |
420 | }; | |
421 | ||
422 | static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) | |
423 | { | |
424 | int ret; | |
425 | ||
426 | head->cdev_device.parent = &head->subsys->dev; | |
427 | ret = dev_set_name(&head->cdev_device, "ng%dn%d", | |
428 | head->subsys->instance, head->instance); | |
429 | if (ret) | |
430 | return ret; | |
431 | ret = nvme_cdev_add(&head->cdev, &head->cdev_device, | |
432 | &nvme_ns_head_chr_fops, THIS_MODULE); | |
2637baed MI |
433 | return ret; |
434 | } | |
435 | ||
32acab31 CH |
436 | static void nvme_requeue_work(struct work_struct *work) |
437 | { | |
438 | struct nvme_ns_head *head = | |
439 | container_of(work, struct nvme_ns_head, requeue_work); | |
440 | struct bio *bio, *next; | |
441 | ||
442 | spin_lock_irq(&head->requeue_lock); | |
443 | next = bio_list_get(&head->requeue_list); | |
444 | spin_unlock_irq(&head->requeue_lock); | |
445 | ||
446 | while ((bio = next) != NULL) { | |
447 | next = bio->bi_next; | |
448 | bio->bi_next = NULL; | |
449 | ||
ed00aabd | 450 | submit_bio_noacct(bio); |
32acab31 CH |
451 | } |
452 | } | |
453 | ||
454 | int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) | |
455 | { | |
32acab31 CH |
456 | bool vwc = false; |
457 | ||
0d0b660f | 458 | mutex_init(&head->lock); |
32acab31 CH |
459 | bio_list_init(&head->requeue_list); |
460 | spin_lock_init(&head->requeue_lock); | |
461 | INIT_WORK(&head->requeue_work, nvme_requeue_work); | |
462 | ||
463 | /* | |
464 | * Add a multipath node if the subsystems supports multiple controllers. | |
58fd3632 SM |
465 | * We also do this for private namespaces as the namespace sharing flag |
466 | * could change after a rescan. | |
32acab31 | 467 | */ |
58fd3632 SM |
468 | if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || |
469 | !nvme_is_unique_nsid(ctrl, head) || !multipath) | |
32acab31 CH |
470 | return 0; |
471 | ||
f165fb89 | 472 | head->disk = blk_alloc_disk(ctrl->numa_node); |
32acab31 | 473 | if (!head->disk) |
f165fb89 | 474 | return -ENOMEM; |
32acab31 CH |
475 | head->disk->fops = &nvme_ns_head_ops; |
476 | head->disk->private_data = head; | |
32acab31 CH |
477 | sprintf(head->disk->disk_name, "nvme%dn%d", |
478 | ctrl->subsys->instance, head->instance); | |
32acab31 | 479 | |
f165fb89 | 480 | blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); |
d32d3d0b CH |
481 | blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); |
482 | ||
f165fb89 CH |
483 | /* set to a default value of 512 until the disk is validated */ |
484 | blk_queue_logical_block_size(head->disk->queue, 512); | |
485 | blk_set_stacking_limits(&head->disk->queue->limits); | |
486 | ||
487 | /* we need to propagate up the VMC settings */ | |
488 | if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) | |
489 | vwc = true; | |
490 | blk_queue_write_cache(head->disk->queue, vwc, vwc); | |
491 | return 0; | |
32acab31 CH |
492 | } |
493 | ||
0d0b660f | 494 | static void nvme_mpath_set_live(struct nvme_ns *ns) |
32acab31 | 495 | { |
0d0b660f CH |
496 | struct nvme_ns_head *head = ns->head; |
497 | ||
32acab31 CH |
498 | if (!head->disk) |
499 | return; | |
9bd82b1a | 500 | |
2637baed | 501 | if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { |
33b14f67 HR |
502 | device_add_disk(&head->subsys->dev, head->disk, |
503 | nvme_ns_id_attr_groups); | |
2637baed MI |
504 | nvme_add_ns_head_cdev(head); |
505 | } | |
0d0b660f | 506 | |
d8a22f85 | 507 | mutex_lock(&head->lock); |
886fabf6 KB |
508 | if (nvme_path_is_optimized(ns)) { |
509 | int node, srcu_idx; | |
510 | ||
511 | srcu_idx = srcu_read_lock(&head->srcu); | |
512 | for_each_node(node) | |
513 | __nvme_find_path(head, node); | |
514 | srcu_read_unlock(&head->srcu, srcu_idx); | |
515 | } | |
e164471d | 516 | mutex_unlock(&head->lock); |
886fabf6 | 517 | |
e164471d SG |
518 | synchronize_srcu(&head->srcu); |
519 | kblockd_schedule_work(&head->requeue_work); | |
0d0b660f CH |
520 | } |
521 | ||
522 | static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data, | |
523 | int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *, | |
524 | void *)) | |
525 | { | |
526 | void *base = ctrl->ana_log_buf; | |
527 | size_t offset = sizeof(struct nvme_ana_rsp_hdr); | |
528 | int error, i; | |
529 | ||
530 | lockdep_assert_held(&ctrl->ana_lock); | |
531 | ||
532 | for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) { | |
533 | struct nvme_ana_group_desc *desc = base + offset; | |
64fab729 PS |
534 | u32 nr_nsids; |
535 | size_t nsid_buf_size; | |
536 | ||
537 | if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc))) | |
538 | return -EINVAL; | |
539 | ||
540 | nr_nsids = le32_to_cpu(desc->nnsids); | |
541 | nsid_buf_size = nr_nsids * sizeof(__le32); | |
0d0b660f CH |
542 | |
543 | if (WARN_ON_ONCE(desc->grpid == 0)) | |
544 | return -EINVAL; | |
545 | if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax)) | |
546 | return -EINVAL; | |
547 | if (WARN_ON_ONCE(desc->state == 0)) | |
548 | return -EINVAL; | |
549 | if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE)) | |
550 | return -EINVAL; | |
551 | ||
552 | offset += sizeof(*desc); | |
553 | if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size)) | |
554 | return -EINVAL; | |
555 | ||
556 | error = cb(ctrl, desc, data); | |
557 | if (error) | |
558 | return error; | |
559 | ||
560 | offset += nsid_buf_size; | |
0d0b660f CH |
561 | } |
562 | ||
563 | return 0; | |
564 | } | |
565 | ||
566 | static inline bool nvme_state_is_live(enum nvme_ana_state state) | |
567 | { | |
568 | return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED; | |
569 | } | |
570 | ||
571 | static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, | |
572 | struct nvme_ns *ns) | |
573 | { | |
0d0b660f CH |
574 | ns->ana_grpid = le32_to_cpu(desc->grpid); |
575 | ns->ana_state = desc->state; | |
576 | clear_bit(NVME_NS_ANA_PENDING, &ns->flags); | |
577 | ||
cc2278c4 | 578 | if (nvme_state_is_live(ns->ana_state)) |
0d0b660f | 579 | nvme_mpath_set_live(ns); |
0d0b660f CH |
580 | } |
581 | ||
582 | static int nvme_update_ana_state(struct nvme_ctrl *ctrl, | |
583 | struct nvme_ana_group_desc *desc, void *data) | |
584 | { | |
585 | u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; | |
586 | unsigned *nr_change_groups = data; | |
587 | struct nvme_ns *ns; | |
588 | ||
592b6e7b | 589 | dev_dbg(ctrl->device, "ANA group %d: %s.\n", |
0d0b660f CH |
590 | le32_to_cpu(desc->grpid), |
591 | nvme_ana_state_names[desc->state]); | |
592 | ||
593 | if (desc->state == NVME_ANA_CHANGE) | |
594 | (*nr_change_groups)++; | |
595 | ||
596 | if (!nr_nsids) | |
597 | return 0; | |
598 | ||
657f1975 | 599 | down_read(&ctrl->namespaces_rwsem); |
0d0b660f | 600 | list_for_each_entry(ns, &ctrl->namespaces, list) { |
79f528af AE |
601 | unsigned nsid; |
602 | again: | |
603 | nsid = le32_to_cpu(desc->nsids[n]); | |
e01f91df | 604 | if (ns->head->ns_id < nsid) |
0d0b660f | 605 | continue; |
e01f91df AE |
606 | if (ns->head->ns_id == nsid) |
607 | nvme_update_ns_ana_state(desc, ns); | |
0d0b660f CH |
608 | if (++n == nr_nsids) |
609 | break; | |
79f528af AE |
610 | if (ns->head->ns_id > nsid) |
611 | goto again; | |
0d0b660f | 612 | } |
657f1975 | 613 | up_read(&ctrl->namespaces_rwsem); |
0d0b660f CH |
614 | return 0; |
615 | } | |
616 | ||
86cccfbf | 617 | static int nvme_read_ana_log(struct nvme_ctrl *ctrl) |
0d0b660f CH |
618 | { |
619 | u32 nr_change_groups = 0; | |
620 | int error; | |
621 | ||
622 | mutex_lock(&ctrl->ana_lock); | |
be93e87e | 623 | error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, |
0d0b660f CH |
624 | ctrl->ana_log_buf, ctrl->ana_log_size, 0); |
625 | if (error) { | |
626 | dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); | |
627 | goto out_unlock; | |
628 | } | |
629 | ||
630 | error = nvme_parse_ana_log(ctrl, &nr_change_groups, | |
631 | nvme_update_ana_state); | |
632 | if (error) | |
633 | goto out_unlock; | |
634 | ||
635 | /* | |
636 | * In theory we should have an ANATT timer per group as they might enter | |
637 | * the change state at different times. But that is a lot of overhead | |
638 | * just to protect against a target that keeps entering new changes | |
639 | * states while never finishing previous ones. But we'll still | |
640 | * eventually time out once all groups are in change state, so this | |
641 | * isn't a big deal. | |
642 | * | |
643 | * We also double the ANATT value to provide some slack for transports | |
644 | * or AEN processing overhead. | |
645 | */ | |
646 | if (nr_change_groups) | |
647 | mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies); | |
648 | else | |
649 | del_timer_sync(&ctrl->anatt_timer); | |
650 | out_unlock: | |
651 | mutex_unlock(&ctrl->ana_lock); | |
652 | return error; | |
653 | } | |
654 | ||
655 | static void nvme_ana_work(struct work_struct *work) | |
656 | { | |
657 | struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work); | |
658 | ||
ecca390e SG |
659 | if (ctrl->state != NVME_CTRL_LIVE) |
660 | return; | |
661 | ||
86cccfbf | 662 | nvme_read_ana_log(ctrl); |
0d0b660f CH |
663 | } |
664 | ||
665 | static void nvme_anatt_timeout(struct timer_list *t) | |
666 | { | |
667 | struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); | |
668 | ||
669 | dev_info(ctrl->device, "ANATT timeout, resetting controller.\n"); | |
670 | nvme_reset_ctrl(ctrl); | |
671 | } | |
672 | ||
673 | void nvme_mpath_stop(struct nvme_ctrl *ctrl) | |
674 | { | |
675 | if (!nvme_ctrl_use_ana(ctrl)) | |
676 | return; | |
677 | del_timer_sync(&ctrl->anatt_timer); | |
678 | cancel_work_sync(&ctrl->ana_work); | |
679 | } | |
680 | ||
75c10e73 HR |
/*
 * Define a device attribute named subsys_attr_<_name> with the given mode
 * and show/store handlers.
 */
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)		\
	struct device_attribute subsys_attr_##_name =		\
		__ATTR(_name, _mode, _show, _store)
684 | ||
685 | static const char *nvme_iopolicy_names[] = { | |
686 | [NVME_IOPOLICY_NUMA] = "numa", | |
687 | [NVME_IOPOLICY_RR] = "round-robin", | |
688 | }; | |
689 | ||
690 | static ssize_t nvme_subsys_iopolicy_show(struct device *dev, | |
691 | struct device_attribute *attr, char *buf) | |
692 | { | |
693 | struct nvme_subsystem *subsys = | |
694 | container_of(dev, struct nvme_subsystem, dev); | |
695 | ||
bff4bcf3 DW |
696 | return sysfs_emit(buf, "%s\n", |
697 | nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); | |
75c10e73 HR |
698 | } |
699 | ||
700 | static ssize_t nvme_subsys_iopolicy_store(struct device *dev, | |
701 | struct device_attribute *attr, const char *buf, size_t count) | |
702 | { | |
703 | struct nvme_subsystem *subsys = | |
704 | container_of(dev, struct nvme_subsystem, dev); | |
705 | int i; | |
706 | ||
707 | for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { | |
708 | if (sysfs_streq(buf, nvme_iopolicy_names[i])) { | |
709 | WRITE_ONCE(subsys->iopolicy, i); | |
710 | return count; | |
711 | } | |
712 | } | |
713 | ||
714 | return -EINVAL; | |
715 | } | |
716 | SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, | |
717 | nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store); | |
718 | ||
0d0b660f CH |
719 | static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, |
720 | char *buf) | |
721 | { | |
bff4bcf3 | 722 | return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); |
0d0b660f CH |
723 | } |
724 | DEVICE_ATTR_RO(ana_grpid); | |
725 | ||
726 | static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, | |
727 | char *buf) | |
728 | { | |
729 | struct nvme_ns *ns = nvme_get_ns_from_dev(dev); | |
730 | ||
bff4bcf3 | 731 | return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); |
0d0b660f CH |
732 | } |
733 | DEVICE_ATTR_RO(ana_state); | |
734 | ||
489dd102 | 735 | static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, |
0d0b660f CH |
736 | struct nvme_ana_group_desc *desc, void *data) |
737 | { | |
489dd102 | 738 | struct nvme_ana_group_desc *dst = data; |
0d0b660f | 739 | |
489dd102 AE |
740 | if (desc->grpid != dst->grpid) |
741 | return 0; | |
0d0b660f | 742 | |
489dd102 AE |
743 | *dst = *desc; |
744 | return -ENXIO; /* just break out of the loop */ | |
0d0b660f CH |
745 | } |
746 | ||
747 | void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) | |
748 | { | |
749 | if (nvme_ctrl_use_ana(ns->ctrl)) { | |
489dd102 AE |
750 | struct nvme_ana_group_desc desc = { |
751 | .grpid = id->anagrpid, | |
752 | .state = 0, | |
753 | }; | |
754 | ||
0d0b660f CH |
755 | mutex_lock(&ns->ctrl->ana_lock); |
756 | ns->ana_grpid = le32_to_cpu(id->anagrpid); | |
489dd102 | 757 | nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc); |
0d0b660f | 758 | mutex_unlock(&ns->ctrl->ana_lock); |
489dd102 AE |
759 | if (desc.state) { |
760 | /* found the group desc: update */ | |
761 | nvme_update_ns_ana_state(&desc, ns); | |
dd8f7fa9 HR |
762 | } else { |
763 | /* group desc not found: trigger a re-read */ | |
764 | set_bit(NVME_NS_ANA_PENDING, &ns->flags); | |
765 | queue_work(nvme_wq, &ns->ctrl->ana_work); | |
489dd102 | 766 | } |
0d0b660f | 767 | } else { |
e234f1f8 | 768 | ns->ana_state = NVME_ANA_OPTIMIZED; |
0d0b660f | 769 | nvme_mpath_set_live(ns); |
9bd82b1a | 770 | } |
b2ce4d90 | 771 | |
1cb039f3 CH |
772 | if (blk_queue_stable_writes(ns->queue) && ns->head->disk) |
773 | blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, | |
774 | ns->head->disk->queue); | |
73a1a229 KB |
775 | #ifdef CONFIG_BLK_DEV_ZONED |
776 | if (blk_queue_is_zoned(ns->queue) && ns->head->disk) | |
777 | ns->head->disk->queue->nr_zones = ns->queue->nr_zones; | |
778 | #endif | |
32acab31 CH |
779 | } |
780 | ||
5396fdac | 781 | void nvme_mpath_shutdown_disk(struct nvme_ns_head *head) |
32acab31 CH |
782 | { |
783 | if (!head->disk) | |
784 | return; | |
5396fdac | 785 | kblockd_schedule_work(&head->requeue_work); |
916a470d | 786 | if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { |
2637baed | 787 | nvme_cdev_del(&head->cdev, &head->cdev_device); |
0d0b660f | 788 | del_gendisk(head->disk); |
2637baed | 789 | } |
5396fdac HR |
790 | } |
791 | ||
792 | void nvme_mpath_remove_disk(struct nvme_ns_head *head) | |
793 | { | |
794 | if (!head->disk) | |
795 | return; | |
b142c5ae | 796 | blk_mark_disk_dead(head->disk); |
32acab31 CH |
797 | /* make sure all pending bios are cleaned up */ |
798 | kblockd_schedule_work(&head->requeue_work); | |
799 | flush_work(&head->requeue_work); | |
f165fb89 | 800 | blk_cleanup_disk(head->disk); |
32acab31 | 801 | } |
0d0b660f | 802 | |
5e1f6899 | 803 | void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) |
0d0b660f | 804 | { |
5e1f6899 CH |
805 | mutex_init(&ctrl->ana_lock); |
806 | timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); | |
807 | INIT_WORK(&ctrl->ana_work, nvme_ana_work); | |
808 | } | |
809 | ||
810 | int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) | |
811 | { | |
812 | size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT; | |
813 | size_t ana_log_size; | |
814 | int error = 0; | |
0d0b660f | 815 | |
66b20ac0 | 816 | /* check if multipath is enabled and we have the capability */ |
92decf11 KB |
817 | if (!multipath || !ctrl->subsys || |
818 | !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA)) | |
0d0b660f CH |
819 | return 0; |
820 | ||
120bb362 DW |
821 | if (!ctrl->max_namespaces || |
822 | ctrl->max_namespaces > le32_to_cpu(id->nn)) { | |
823 | dev_err(ctrl->device, | |
824 | "Invalid MNAN value %u\n", ctrl->max_namespaces); | |
825 | return -EINVAL; | |
826 | } | |
827 | ||
0d0b660f CH |
828 | ctrl->anacap = id->anacap; |
829 | ctrl->anatt = id->anatt; | |
830 | ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); | |
831 | ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); | |
832 | ||
5e1f6899 CH |
833 | ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + |
834 | ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) + | |
835 | ctrl->max_namespaces * sizeof(__le32); | |
836 | if (ana_log_size > max_transfer_size) { | |
0d0b660f | 837 | dev_err(ctrl->device, |
5e1f6899 CH |
838 | "ANA log page size (%zd) larger than MDTS (%zd).\n", |
839 | ana_log_size, max_transfer_size); | |
0d0b660f | 840 | dev_err(ctrl->device, "disabling ANA support.\n"); |
5e1f6899 | 841 | goto out_uninit; |
0d0b660f | 842 | } |
5e1f6899 CH |
843 | if (ana_log_size > ctrl->ana_log_size) { |
844 | nvme_mpath_stop(ctrl); | |
845 | kfree(ctrl->ana_log_buf); | |
e181811b | 846 | ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL); |
5e1f6899 CH |
847 | if (!ctrl->ana_log_buf) |
848 | return -ENOMEM; | |
bb830add | 849 | } |
5e1f6899 | 850 | ctrl->ana_log_size = ana_log_size; |
86cccfbf | 851 | error = nvme_read_ana_log(ctrl); |
0d0b660f | 852 | if (error) |
5e1f6899 | 853 | goto out_uninit; |
0d0b660f | 854 | return 0; |
5e1f6899 CH |
855 | |
856 | out_uninit: | |
857 | nvme_mpath_uninit(ctrl); | |
bb830add | 858 | return error; |
0d0b660f CH |
859 | } |
860 | ||
861 | void nvme_mpath_uninit(struct nvme_ctrl *ctrl) | |
862 | { | |
863 | kfree(ctrl->ana_log_buf); | |
c7055fd1 | 864 | ctrl->ana_log_buf = NULL; |
0d0b660f | 865 | } |