drivers/nvme/host/core.c
1 /*
2 * NVM Express device driver
3 * Copyright (c) 2011-2014, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14
15 #include <linux/blkdev.h>
16 #include <linux/blk-mq.h>
17 #include <linux/delay.h>
18 #include <linux/errno.h>
19 #include <linux/hdreg.h>
20 #include <linux/kernel.h>
21 #include <linux/module.h>
22 #include <linux/list_sort.h>
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/pr.h>
26 #include <linux/ptrace.h>
27 #include <linux/nvme_ioctl.h>
28 #include <linux/t10-pi.h>
29 #include <linux/pm_qos.h>
30 #include <asm/unaligned.h>
31
32 #include "nvme.h"
33 #include "fabrics.h"
34
35 #define NVME_MINORS (1U << MINORBITS)
36
37 unsigned int admin_timeout = 60;
38 module_param(admin_timeout, uint, 0644);
39 MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
40 EXPORT_SYMBOL_GPL(admin_timeout);
41
42 unsigned int nvme_io_timeout = 30;
43 module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
44 MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
45 EXPORT_SYMBOL_GPL(nvme_io_timeout);
46
47 static unsigned char shutdown_timeout = 5;
48 module_param(shutdown_timeout, byte, 0644);
49 MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
50
51 static u8 nvme_max_retries = 5;
52 module_param_named(max_retries, nvme_max_retries, byte, 0644);
53 MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
54
55 static unsigned long default_ps_max_latency_us = 100000;
56 module_param(default_ps_max_latency_us, ulong, 0644);
57 MODULE_PARM_DESC(default_ps_max_latency_us,
58 "max power saving latency for new devices; use PM QOS to change per device");
59
60 static bool force_apst;
61 module_param(force_apst, bool, 0644);
62 MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
63
64 static bool streams;
65 module_param(streams, bool, 0644);
66 MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
67
68 struct workqueue_struct *nvme_wq;
69 EXPORT_SYMBOL_GPL(nvme_wq);
70
71 static DEFINE_IDA(nvme_subsystems_ida);
72 static LIST_HEAD(nvme_subsystems);
73 static DEFINE_MUTEX(nvme_subsystems_lock);
74
75 static DEFINE_IDA(nvme_instance_ida);
76 static dev_t nvme_chr_devt;
77 static struct class *nvme_class;
78 static struct class *nvme_subsys_class;
79
80 static void nvme_ns_remove(struct nvme_ns *ns);
81 static int nvme_revalidate_disk(struct gendisk *disk);
82 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
83
84 static __le32 nvme_get_log_dw10(u8 lid, size_t size)
85 {
86 return cpu_to_le32((((size / 4) - 1) << 16) | lid);
87 }
88
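/*
 * Transition the controller to RESETTING and queue the reset work.
 * Returns -EBUSY if the state change is not allowed or a reset is
 * already pending.
 */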
89 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
90 {
91 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
92 return -EBUSY;
93 if (!queue_work(nvme_wq, &ctrl->reset_work))
94 return -EBUSY;
95 return 0;
96 }
97 EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
98
99 static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
100 {
101 int ret;
102
103 ret = nvme_reset_ctrl(ctrl);
104 if (!ret)
105 flush_work(&ctrl->reset_work);
106 return ret;
107 }
108
109 static void nvme_delete_ctrl_work(struct work_struct *work)
110 {
111 struct nvme_ctrl *ctrl =
112 container_of(work, struct nvme_ctrl, delete_work);
113
114 flush_work(&ctrl->reset_work);
115 nvme_stop_ctrl(ctrl);
116 nvme_remove_namespaces(ctrl);
117 ctrl->ops->delete_ctrl(ctrl);
118 nvme_uninit_ctrl(ctrl);
119 nvme_put_ctrl(ctrl);
120 }
121
122 int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
123 {
124 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
125 return -EBUSY;
126 if (!queue_work(nvme_wq, &ctrl->delete_work))
127 return -EBUSY;
128 return 0;
129 }
130 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
131
132 int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
133 {
134 int ret = 0;
135
136 /*
137 * Keep a reference until the work is flushed since ->delete_ctrl
138 * can free the controller.
139 */
140 nvme_get_ctrl(ctrl);
141 ret = nvme_delete_ctrl(ctrl);
142 if (!ret)
143 flush_work(&ctrl->delete_work);
144 nvme_put_ctrl(ctrl);
145 return ret;
146 }
147 EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
148
149 static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
150 {
151 return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
152 }
153
154 static blk_status_t nvme_error_status(struct request *req)
155 {
156 switch (nvme_req(req)->status & 0x7ff) {
157 case NVME_SC_SUCCESS:
158 return BLK_STS_OK;
159 case NVME_SC_CAP_EXCEEDED:
160 return BLK_STS_NOSPC;
161 case NVME_SC_ONCS_NOT_SUPPORTED:
162 return BLK_STS_NOTSUPP;
163 case NVME_SC_WRITE_FAULT:
164 case NVME_SC_READ_ERROR:
165 case NVME_SC_UNWRITTEN_BLOCK:
166 case NVME_SC_ACCESS_DENIED:
167 case NVME_SC_READ_ONLY:
168 return BLK_STS_MEDIUM;
169 case NVME_SC_GUARD_CHECK:
170 case NVME_SC_APPTAG_CHECK:
171 case NVME_SC_REFTAG_CHECK:
172 case NVME_SC_INVALID_PI:
173 return BLK_STS_PROTECTION;
174 case NVME_SC_RESERVATION_CONFLICT:
175 return BLK_STS_NEXUS;
176 default:
177 return BLK_STS_IOERR;
178 }
179 }
180
181 static inline bool nvme_req_needs_retry(struct request *req)
182 {
183 if (blk_noretry_request(req))
184 return false;
185 if (nvme_req(req)->status & NVME_SC_DNR)
186 return false;
187 if (nvme_req(req)->retries >= nvme_max_retries)
188 return false;
189 return true;
190 }
191
192 void nvme_complete_rq(struct request *req)
193 {
194 if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
195 if (nvme_req_needs_failover(req)) {
196 nvme_failover_req(req);
197 return;
198 }
199
200 if (!blk_queue_dying(req->q)) {
201 nvme_req(req)->retries++;
202 blk_mq_requeue_request(req, true);
203 return;
204 }
205 }
206
207 blk_mq_end_request(req, nvme_error_status(req));
208 }
209 EXPORT_SYMBOL_GPL(nvme_complete_rq);
210
211 void nvme_cancel_request(struct request *req, void *data, bool reserved)
212 {
213 if (!blk_mq_request_started(req))
214 return;
215
216 dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
217 "Cancelling I/O %d", req->tag);
218
219 nvme_req(req)->status = NVME_SC_ABORT_REQ;
220 blk_mq_complete_request(req);
221
222 }
223 EXPORT_SYMBOL_GPL(nvme_cancel_request);
224
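/*
 * Controller state machine: only the transitions listed below are
 * accepted, serialized by ctrl->lock.  Returns true if the state was
 * changed; entering LIVE also kicks the multipath requeue lists.
 */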
225 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
226 enum nvme_ctrl_state new_state)
227 {
228 enum nvme_ctrl_state old_state;
229 unsigned long flags;
230 bool changed = false;
231
232 spin_lock_irqsave(&ctrl->lock, flags);
233
234 old_state = ctrl->state;
235 switch (new_state) {
236 case NVME_CTRL_LIVE:
237 switch (old_state) {
238 case NVME_CTRL_NEW:
239 case NVME_CTRL_RESETTING:
240 case NVME_CTRL_RECONNECTING:
241 changed = true;
242 /* FALLTHRU */
243 default:
244 break;
245 }
246 break;
247 case NVME_CTRL_RESETTING:
248 switch (old_state) {
249 case NVME_CTRL_NEW:
250 case NVME_CTRL_LIVE:
251 changed = true;
252 /* FALLTHRU */
253 default:
254 break;
255 }
256 break;
257 case NVME_CTRL_RECONNECTING:
258 switch (old_state) {
259 case NVME_CTRL_LIVE:
260 case NVME_CTRL_RESETTING:
261 changed = true;
262 /* FALLTHRU */
263 default:
264 break;
265 }
266 break;
267 case NVME_CTRL_DELETING:
268 switch (old_state) {
269 case NVME_CTRL_LIVE:
270 case NVME_CTRL_RESETTING:
271 case NVME_CTRL_RECONNECTING:
272 changed = true;
273 /* FALLTHRU */
274 default:
275 break;
276 }
277 break;
278 case NVME_CTRL_DEAD:
279 switch (old_state) {
280 case NVME_CTRL_DELETING:
281 changed = true;
282 /* FALLTHRU */
283 default:
284 break;
285 }
286 break;
287 default:
288 break;
289 }
290
291 if (changed)
292 ctrl->state = new_state;
293
294 spin_unlock_irqrestore(&ctrl->lock, flags);
295 if (changed && ctrl->state == NVME_CTRL_LIVE)
296 nvme_kick_requeue_lists(ctrl);
297 return changed;
298 }
299 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
300
301 static void nvme_free_ns_head(struct kref *ref)
302 {
303 struct nvme_ns_head *head =
304 container_of(ref, struct nvme_ns_head, ref);
305
306 nvme_mpath_remove_disk(head);
307 ida_simple_remove(&head->subsys->ns_ida, head->instance);
308 list_del_init(&head->entry);
309 cleanup_srcu_struct(&head->srcu);
310 nvme_put_subsystem(head->subsys);
311 kfree(head);
312 }
313
314 static void nvme_put_ns_head(struct nvme_ns_head *head)
315 {
316 kref_put(&head->ref, nvme_free_ns_head);
317 }
318
319 static void nvme_free_ns(struct kref *kref)
320 {
321 struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
322
323 if (ns->ndev)
324 nvme_nvm_unregister(ns);
325
326 put_disk(ns->disk);
327 nvme_put_ns_head(ns->head);
328 nvme_put_ctrl(ns->ctrl);
329 kfree(ns);
330 }
331
332 static void nvme_put_ns(struct nvme_ns *ns)
333 {
334 kref_put(&ns->kref, nvme_free_ns);
335 }
336
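/*
 * Allocate a request for a passthrough NVMe command, either on any
 * hardware queue (NVME_QID_ANY) or on a specific one.  The request is
 * marked REQ_FAILFAST_DRIVER so it is never retried by the driver.
 */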
337 struct request *nvme_alloc_request(struct request_queue *q,
338 struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
339 {
340 unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
341 struct request *req;
342
343 if (qid == NVME_QID_ANY) {
344 req = blk_mq_alloc_request(q, op, flags);
345 } else {
346 req = blk_mq_alloc_request_hctx(q, op, flags,
347 qid ? qid - 1 : 0);
348 }
349 if (IS_ERR(req))
350 return req;
351
352 req->cmd_flags |= REQ_FAILFAST_DRIVER;
353 nvme_req(req)->cmd = cmd;
354
355 return req;
356 }
357 EXPORT_SYMBOL_GPL(nvme_alloc_request);
358
359 static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
360 {
361 struct nvme_command c;
362
363 memset(&c, 0, sizeof(c));
364
365 c.directive.opcode = nvme_admin_directive_send;
366 c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
367 c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
368 c.directive.dtype = NVME_DIR_IDENTIFY;
369 c.directive.tdtype = NVME_DIR_STREAMS;
370 c.directive.endir = enable ? NVME_DIR_ENDIR : 0;
371
372 return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
373 }
374
375 static int nvme_disable_streams(struct nvme_ctrl *ctrl)
376 {
377 return nvme_toggle_streams(ctrl, false);
378 }
379
380 static int nvme_enable_streams(struct nvme_ctrl *ctrl)
381 {
382 return nvme_toggle_streams(ctrl, true);
383 }
384
385 static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
386 struct streams_directive_params *s, u32 nsid)
387 {
388 struct nvme_command c;
389
390 memset(&c, 0, sizeof(c));
391 memset(s, 0, sizeof(*s));
392
393 c.directive.opcode = nvme_admin_directive_recv;
394 c.directive.nsid = cpu_to_le32(nsid);
395 c.directive.numd = cpu_to_le32((sizeof(*s) >> 2) - 1);
396 c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
397 c.directive.dtype = NVME_DIR_STREAMS;
398
399 return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
400 }
401
402 static int nvme_configure_directives(struct nvme_ctrl *ctrl)
403 {
404 struct streams_directive_params s;
405 int ret;
406
407 if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
408 return 0;
409 if (!streams)
410 return 0;
411
412 ret = nvme_enable_streams(ctrl);
413 if (ret)
414 return ret;
415
416 ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
417 if (ret)
418 return ret;
419
420 ctrl->nssa = le16_to_cpu(s.nssa);
421 if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
422 dev_info(ctrl->device, "too few streams (%u) available\n",
423 ctrl->nssa);
424 nvme_disable_streams(ctrl);
425 return 0;
426 }
427
428 ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
429 dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
430 return 0;
431 }
432
433 /*
434 * Check if 'req' has a write hint associated with it. If it does, assign
435 * a valid namespace stream to the write.
436 */
437 static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
438 struct request *req, u16 *control,
439 u32 *dsmgmt)
440 {
441 enum rw_hint streamid = req->write_hint;
442
443 if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
444 streamid = 0;
445 else {
446 streamid--;
447 if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
448 return;
449
450 *control |= NVME_RW_DTYPE_STREAMS;
451 *dsmgmt |= streamid << 16;
452 }
453
454 if (streamid < ARRAY_SIZE(req->q->write_hints))
455 req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
456 }
457
458 static inline void nvme_setup_flush(struct nvme_ns *ns,
459 struct nvme_command *cmnd)
460 {
461 memset(cmnd, 0, sizeof(*cmnd));
462 cmnd->common.opcode = nvme_cmd_flush;
463 cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
464 }
465
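/*
 * Translate a discard request into a DSM deallocate command, building
 * one range descriptor per bio and attaching the range list as the
 * request's special payload.
 */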
466 static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
467 struct nvme_command *cmnd)
468 {
469 unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
470 struct nvme_dsm_range *range;
471 struct bio *bio;
472
473 range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
474 if (!range)
475 return BLK_STS_RESOURCE;
476
477 __rq_for_each_bio(bio, req) {
478 u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
479 u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
480
481 range[n].cattr = cpu_to_le32(0);
482 range[n].nlb = cpu_to_le32(nlb);
483 range[n].slba = cpu_to_le64(slba);
484 n++;
485 }
486
487 if (WARN_ON_ONCE(n != segments)) {
488 kfree(range);
489 return BLK_STS_IOERR;
490 }
491
492 memset(cmnd, 0, sizeof(*cmnd));
493 cmnd->dsm.opcode = nvme_cmd_dsm;
494 cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
495 cmnd->dsm.nr = cpu_to_le32(segments - 1);
496 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
497
498 req->special_vec.bv_page = virt_to_page(range);
499 req->special_vec.bv_offset = offset_in_page(range);
500 req->special_vec.bv_len = sizeof(*range) * segments;
501 req->rq_flags |= RQF_SPECIAL_PAYLOAD;
502
503 return BLK_STS_OK;
504 }
505
506 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
507 struct request *req, struct nvme_command *cmnd)
508 {
509 struct nvme_ctrl *ctrl = ns->ctrl;
510 u16 control = 0;
511 u32 dsmgmt = 0;
512
513 if (req->cmd_flags & REQ_FUA)
514 control |= NVME_RW_FUA;
515 if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
516 control |= NVME_RW_LR;
517
518 if (req->cmd_flags & REQ_RAHEAD)
519 dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
520
521 memset(cmnd, 0, sizeof(*cmnd));
522 cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
523 cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
524 cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
525 cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
526
527 if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
528 nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
529
530 if (ns->ms) {
531 /*
532 * If formatted with metadata, the block layer always provides a
533 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
534 * we enable the PRACT bit for protection information or set the
535 * namespace capacity to zero to prevent any I/O.
536 */
537 if (!blk_integrity_rq(req)) {
538 if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
539 return BLK_STS_NOTSUPP;
540 control |= NVME_RW_PRINFO_PRACT;
541 }
542
543 switch (ns->pi_type) {
544 case NVME_NS_DPS_PI_TYPE3:
545 control |= NVME_RW_PRINFO_PRCHK_GUARD;
546 break;
547 case NVME_NS_DPS_PI_TYPE1:
548 case NVME_NS_DPS_PI_TYPE2:
549 control |= NVME_RW_PRINFO_PRCHK_GUARD |
550 NVME_RW_PRINFO_PRCHK_REF;
551 cmnd->rw.reftag = cpu_to_le32(
552 nvme_block_nr(ns, blk_rq_pos(req)));
553 break;
554 }
555 }
556
557 cmnd->rw.control = cpu_to_le16(control);
558 cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
559 return 0;
560 }
561
562 blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
563 struct nvme_command *cmd)
564 {
565 blk_status_t ret = BLK_STS_OK;
566
567 if (!(req->rq_flags & RQF_DONTPREP)) {
568 nvme_req(req)->retries = 0;
569 nvme_req(req)->flags = 0;
570 req->rq_flags |= RQF_DONTPREP;
571 }
572
573 switch (req_op(req)) {
574 case REQ_OP_DRV_IN:
575 case REQ_OP_DRV_OUT:
576 memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
577 break;
578 case REQ_OP_FLUSH:
579 nvme_setup_flush(ns, cmd);
580 break;
581 case REQ_OP_WRITE_ZEROES:
582 /* currently only aliased to deallocate for a few ctrls: */
583 case REQ_OP_DISCARD:
584 ret = nvme_setup_discard(ns, req, cmd);
585 break;
586 case REQ_OP_READ:
587 case REQ_OP_WRITE:
588 ret = nvme_setup_rw(ns, req, cmd);
589 break;
590 default:
591 WARN_ON_ONCE(1);
592 return BLK_STS_IOERR;
593 }
594
595 cmd->common.command_id = req->tag;
596 return ret;
597 }
598 EXPORT_SYMBOL_GPL(nvme_setup_cmd);
599
600 /*
601 * Returns 0 on success. If the result is negative, it's a Linux error code;
602 * if the result is positive, it's an NVM Express status code
603 */
604 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
605 union nvme_result *result, void *buffer, unsigned bufflen,
606 unsigned timeout, int qid, int at_head,
607 blk_mq_req_flags_t flags)
608 {
609 struct request *req;
610 int ret;
611
612 req = nvme_alloc_request(q, cmd, flags, qid);
613 if (IS_ERR(req))
614 return PTR_ERR(req);
615
616 req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
617
618 if (buffer && bufflen) {
619 ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
620 if (ret)
621 goto out;
622 }
623
624 blk_execute_rq(req->q, NULL, req, at_head);
625 if (result)
626 *result = nvme_req(req)->result;
627 if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
628 ret = -EINTR;
629 else
630 ret = nvme_req(req)->status;
631 out:
632 blk_mq_free_request(req);
633 return ret;
634 }
635 EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
636
637 int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
638 void *buffer, unsigned bufflen)
639 {
640 return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
641 NVME_QID_ANY, 0, 0);
642 }
643 EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
644
645 static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
646 unsigned len, u32 seed, bool write)
647 {
648 struct bio_integrity_payload *bip;
649 int ret = -ENOMEM;
650 void *buf;
651
652 buf = kmalloc(len, GFP_KERNEL);
653 if (!buf)
654 goto out;
655
656 ret = -EFAULT;
657 if (write && copy_from_user(buf, ubuf, len))
658 goto out_free_meta;
659
660 bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
661 if (IS_ERR(bip)) {
662 ret = PTR_ERR(bip);
663 goto out_free_meta;
664 }
665
666 bip->bip_iter.bi_size = len;
667 bip->bip_iter.bi_sector = seed;
668 ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
669 offset_in_page(buf));
670 if (ret == len)
671 return buf;
672 ret = -ENOMEM;
673 out_free_meta:
674 kfree(buf);
675 out:
676 return ERR_PTR(ret);
677 }
678
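/*
 * Map a user data buffer (and optional metadata buffer) into a
 * passthrough request, execute it synchronously and copy metadata back
 * to user space on reads.
 */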
679 static int nvme_submit_user_cmd(struct request_queue *q,
680 struct nvme_command *cmd, void __user *ubuffer,
681 unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
682 u32 meta_seed, u32 *result, unsigned timeout)
683 {
684 bool write = nvme_is_write(cmd);
685 struct nvme_ns *ns = q->queuedata;
686 struct gendisk *disk = ns ? ns->disk : NULL;
687 struct request *req;
688 struct bio *bio = NULL;
689 void *meta = NULL;
690 int ret;
691
692 req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
693 if (IS_ERR(req))
694 return PTR_ERR(req);
695
696 req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
697
698 if (ubuffer && bufflen) {
699 ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
700 GFP_KERNEL);
701 if (ret)
702 goto out;
703 bio = req->bio;
704 bio->bi_disk = disk;
705 if (disk && meta_buffer && meta_len) {
706 meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
707 meta_seed, write);
708 if (IS_ERR(meta)) {
709 ret = PTR_ERR(meta);
710 goto out_unmap;
711 }
712 req->cmd_flags |= REQ_INTEGRITY;
713 }
714 }
715
716 blk_execute_rq(req->q, disk, req, 0);
717 if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
718 ret = -EINTR;
719 else
720 ret = nvme_req(req)->status;
721 if (result)
722 *result = le32_to_cpu(nvme_req(req)->result.u32);
723 if (meta && !ret && !write) {
724 if (copy_to_user(meta_buffer, meta, meta_len))
725 ret = -EFAULT;
726 }
727 kfree(meta);
728 out_unmap:
729 if (bio)
730 blk_rq_unmap_user(bio);
731 out:
732 blk_mq_free_request(req);
733 return ret;
734 }
735
736 static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
737 {
738 struct nvme_ctrl *ctrl = rq->end_io_data;
739
740 blk_mq_free_request(rq);
741
742 if (status) {
743 dev_err(ctrl->device,
744 "failed nvme_keep_alive_end_io error=%d\n",
745 status);
746 return;
747 }
748
749 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
750 }
751
752 static int nvme_keep_alive(struct nvme_ctrl *ctrl)
753 {
754 struct nvme_command c;
755 struct request *rq;
756
757 memset(&c, 0, sizeof(c));
758 c.common.opcode = nvme_admin_keep_alive;
759
760 rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED,
761 NVME_QID_ANY);
762 if (IS_ERR(rq))
763 return PTR_ERR(rq);
764
765 rq->timeout = ctrl->kato * HZ;
766 rq->end_io_data = ctrl;
767
768 blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
769
770 return 0;
771 }
772
773 static void nvme_keep_alive_work(struct work_struct *work)
774 {
775 struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
776 struct nvme_ctrl, ka_work);
777
778 if (nvme_keep_alive(ctrl)) {
779 /* allocation failure, reset the controller */
780 dev_err(ctrl->device, "keep-alive failed\n");
781 nvme_reset_ctrl(ctrl);
782 return;
783 }
784 }
785
786 void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
787 {
788 if (unlikely(ctrl->kato == 0))
789 return;
790
791 INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
792 schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
793 }
794 EXPORT_SYMBOL_GPL(nvme_start_keep_alive);
795
796 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
797 {
798 if (unlikely(ctrl->kato == 0))
799 return;
800
801 cancel_delayed_work_sync(&ctrl->ka_work);
802 }
803 EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
804
805 static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
806 {
807 struct nvme_command c = { };
808 int error;
809
810 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
811 c.identify.opcode = nvme_admin_identify;
812 c.identify.cns = NVME_ID_CNS_CTRL;
813
814 *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
815 if (!*id)
816 return -ENOMEM;
817
818 error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
819 sizeof(struct nvme_id_ctrl));
820 if (error)
821 kfree(*id);
822 return error;
823 }
824
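/*
 * Fetch the Namespace Identification Descriptor list (Identify CNS 03h)
 * and extract any EUI-64, NGUID and UUID identifiers it contains.
 */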
825 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
826 struct nvme_ns_ids *ids)
827 {
828 struct nvme_command c = { };
829 int status;
830 void *data;
831 int pos;
832 int len;
833
834 c.identify.opcode = nvme_admin_identify;
835 c.identify.nsid = cpu_to_le32(nsid);
836 c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
837
838 data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
839 if (!data)
840 return -ENOMEM;
841
842 status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
843 NVME_IDENTIFY_DATA_SIZE);
844 if (status)
845 goto free_data;
846
847 for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
848 struct nvme_ns_id_desc *cur = data + pos;
849
850 if (cur->nidl == 0)
851 break;
852
853 switch (cur->nidt) {
854 case NVME_NIDT_EUI64:
855 if (cur->nidl != NVME_NIDT_EUI64_LEN) {
856 dev_warn(ctrl->device,
857 "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n",
858 cur->nidl);
859 goto free_data;
860 }
861 len = NVME_NIDT_EUI64_LEN;
862 memcpy(ids->eui64, data + pos + sizeof(*cur), len);
863 break;
864 case NVME_NIDT_NGUID:
865 if (cur->nidl != NVME_NIDT_NGUID_LEN) {
866 dev_warn(ctrl->device,
867 "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n",
868 cur->nidl);
869 goto free_data;
870 }
871 len = NVME_NIDT_NGUID_LEN;
872 memcpy(ids->nguid, data + pos + sizeof(*cur), len);
873 break;
874 case NVME_NIDT_UUID:
875 if (cur->nidl != NVME_NIDT_UUID_LEN) {
876 dev_warn(ctrl->device,
877 "ctrl returned bogus length: %d for NVME_NIDT_UUID\n",
878 cur->nidl);
879 goto free_data;
880 }
881 len = NVME_NIDT_UUID_LEN;
882 uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
883 break;
884 default:
885 /* Skip unknown types */
886 len = cur->nidl;
887 break;
888 }
889
890 len += sizeof(*cur);
891 }
892 free_data:
893 kfree(data);
894 return status;
895 }
896
897 static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
898 {
899 struct nvme_command c = { };
900
901 c.identify.opcode = nvme_admin_identify;
902 c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST;
903 c.identify.nsid = cpu_to_le32(nsid);
904 return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
905 }
906
907 static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
908 unsigned nsid)
909 {
910 struct nvme_id_ns *id;
911 struct nvme_command c = { };
912 int error;
913
914 /* gcc-4.4.4 (at least) has issues with initializers and anon unions */
915 c.identify.opcode = nvme_admin_identify;
916 c.identify.nsid = cpu_to_le32(nsid);
917 c.identify.cns = NVME_ID_CNS_NS;
918
919 id = kmalloc(sizeof(*id), GFP_KERNEL);
920 if (!id)
921 return NULL;
922
923 error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
924 if (error) {
925 dev_warn(ctrl->device, "Identify namespace failed\n");
926 kfree(id);
927 return NULL;
928 }
929
930 return id;
931 }
932
933 static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
934 void *buffer, size_t buflen, u32 *result)
935 {
936 struct nvme_command c;
937 union nvme_result res;
938 int ret;
939
940 memset(&c, 0, sizeof(c));
941 c.features.opcode = nvme_admin_set_features;
942 c.features.fid = cpu_to_le32(fid);
943 c.features.dword11 = cpu_to_le32(dword11);
944
945 ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
946 buffer, buflen, 0, NVME_QID_ANY, 0, 0);
947 if (ret >= 0 && result)
948 *result = le32_to_cpu(res.u32);
949 return ret;
950 }
951
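/*
 * Negotiate the number of I/O queues with Set Features (Number of
 * Queues).  On success *count is clamped to what the controller
 * granted; a controller error leaves *count at 0 so that only the
 * admin queue is used.
 */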
952 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
953 {
954 u32 q_count = (*count - 1) | ((*count - 1) << 16);
955 u32 result;
956 int status, nr_io_queues;
957
958 status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
959 &result);
960 if (status < 0)
961 return status;
962
963 /*
964 * Degraded controllers might return an error when setting the queue
965 * count. We still want to be able to bring them online and offer
966 * access to the admin queue, as that might be the only way to fix them up.
967 */
968 if (status > 0) {
969 dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
970 *count = 0;
971 } else {
972 nr_io_queues = min(result & 0xffff, result >> 16) + 1;
973 *count = min(*count, nr_io_queues);
974 }
975
976 return 0;
977 }
978 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
979
980 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
981 {
982 struct nvme_user_io io;
983 struct nvme_command c;
984 unsigned length, meta_len;
985 void __user *metadata;
986
987 if (copy_from_user(&io, uio, sizeof(io)))
988 return -EFAULT;
989 if (io.flags)
990 return -EINVAL;
991
992 switch (io.opcode) {
993 case nvme_cmd_write:
994 case nvme_cmd_read:
995 case nvme_cmd_compare:
996 break;
997 default:
998 return -EINVAL;
999 }
1000
1001 length = (io.nblocks + 1) << ns->lba_shift;
1002 meta_len = (io.nblocks + 1) * ns->ms;
1003 metadata = (void __user *)(uintptr_t)io.metadata;
1004
1005 if (ns->ext) {
1006 length += meta_len;
1007 meta_len = 0;
1008 } else if (meta_len) {
1009 if ((io.metadata & 3) || !io.metadata)
1010 return -EINVAL;
1011 }
1012
1013 memset(&c, 0, sizeof(c));
1014 c.rw.opcode = io.opcode;
1015 c.rw.flags = io.flags;
1016 c.rw.nsid = cpu_to_le32(ns->head->ns_id);
1017 c.rw.slba = cpu_to_le64(io.slba);
1018 c.rw.length = cpu_to_le16(io.nblocks);
1019 c.rw.control = cpu_to_le16(io.control);
1020 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
1021 c.rw.reftag = cpu_to_le32(io.reftag);
1022 c.rw.apptag = cpu_to_le16(io.apptag);
1023 c.rw.appmask = cpu_to_le16(io.appmask);
1024
1025 return nvme_submit_user_cmd(ns->queue, &c,
1026 (void __user *)(uintptr_t)io.addr, length,
1027 metadata, meta_len, io.slba, NULL, 0);
1028 }
1029
1030 static u32 nvme_known_admin_effects(u8 opcode)
1031 {
1032 switch (opcode) {
1033 case nvme_admin_format_nvm:
1034 return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
1035 NVME_CMD_EFFECTS_CSE_MASK;
1036 case nvme_admin_sanitize_nvm:
1037 return NVME_CMD_EFFECTS_CSE_MASK;
1038 default:
1039 break;
1040 }
1041 return 0;
1042 }
1043
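/*
 * Look up the effects of a passthrough command.  If it may change LBA
 * formats or requires exclusive execution, all I/O queues are frozen
 * until nvme_passthru_end() runs.
 */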
1044 static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1045 u8 opcode)
1046 {
1047 u32 effects = 0;
1048
1049 if (ns) {
1050 if (ctrl->effects)
1051 effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
1052 if (effects & ~NVME_CMD_EFFECTS_CSUPP)
1053 dev_warn(ctrl->device,
1054 "IO command:%02x has unhandled effects:%08x\n",
1055 opcode, effects);
1056 return 0;
1057 }
1058
1059 if (ctrl->effects)
1060 effects = le32_to_cpu(ctrl->effects->acs[opcode]);
1061 else
1062 effects = nvme_known_admin_effects(opcode);
1063
1064 /*
1065 * For simplicity, IO to all namespaces is quiesced even if the command
1066 * effects say only one namespace is affected.
1067 */
1068 if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
1069 nvme_start_freeze(ctrl);
1070 nvme_wait_freeze(ctrl);
1071 }
1072 return effects;
1073 }
1074
1075 static void nvme_update_formats(struct nvme_ctrl *ctrl)
1076 {
1077 struct nvme_ns *ns;
1078
1079 mutex_lock(&ctrl->namespaces_mutex);
1080 list_for_each_entry(ns, &ctrl->namespaces, list) {
1081 if (ns->disk && nvme_revalidate_disk(ns->disk))
1082 nvme_ns_remove(ns);
1083 }
1084 mutex_unlock(&ctrl->namespaces_mutex);
1085 }
1086
1087 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
1088 {
1089 /*
1090 * Revalidate LBA changes prior to unfreezing. This is necessary to
1091 * prevent memory corruption if a logical block size was changed by
1092 * this command.
1093 */
1094 if (effects & NVME_CMD_EFFECTS_LBCC)
1095 nvme_update_formats(ctrl);
1096 if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK))
1097 nvme_unfreeze(ctrl);
1098 if (effects & NVME_CMD_EFFECTS_CCC)
1099 nvme_init_identify(ctrl);
1100 if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
1101 nvme_queue_scan(ctrl);
1102 }
1103
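/*
 * Handle NVME_IOCTL_ADMIN_CMD and NVME_IOCTL_IO_CMD: copy in the user
 * command, quiesce I/O around commands with side effects, execute it
 * and return the completion result to user space.
 */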
1104 static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
1105 struct nvme_passthru_cmd __user *ucmd)
1106 {
1107 struct nvme_passthru_cmd cmd;
1108 struct nvme_command c;
1109 unsigned timeout = 0;
1110 u32 effects;
1111 int status;
1112
1113 if (!capable(CAP_SYS_ADMIN))
1114 return -EACCES;
1115 if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
1116 return -EFAULT;
1117 if (cmd.flags)
1118 return -EINVAL;
1119
1120 memset(&c, 0, sizeof(c));
1121 c.common.opcode = cmd.opcode;
1122 c.common.flags = cmd.flags;
1123 c.common.nsid = cpu_to_le32(cmd.nsid);
1124 c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
1125 c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
1126 c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
1127 c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
1128 c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
1129 c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
1130 c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
1131 c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
1132
1133 if (cmd.timeout_ms)
1134 timeout = msecs_to_jiffies(cmd.timeout_ms);
1135
1136 effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
1137 status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
1138 (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
1139 (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
1140 0, &cmd.result, timeout);
1141 nvme_passthru_end(ctrl, effects);
1142
1143 if (status >= 0) {
1144 if (put_user(cmd.result, &ucmd->result))
1145 return -EFAULT;
1146 }
1147
1148 return status;
1149 }
1150
1151 /*
1152 * Issue ioctl requests on the first available path. Note that unlike normal
1153 * block layer requests we will not retry a failed request on another controller.
1154 */
1155 static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
1156 struct nvme_ns_head **head, int *srcu_idx)
1157 {
1158 #ifdef CONFIG_NVME_MULTIPATH
1159 if (disk->fops == &nvme_ns_head_ops) {
1160 *head = disk->private_data;
1161 *srcu_idx = srcu_read_lock(&(*head)->srcu);
1162 return nvme_find_path(*head);
1163 }
1164 #endif
1165 *head = NULL;
1166 *srcu_idx = -1;
1167 return disk->private_data;
1168 }
1169
1170 static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
1171 {
1172 if (head)
1173 srcu_read_unlock(&head->srcu, idx);
1174 }
1175
1176 static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg)
1177 {
1178 switch (cmd) {
1179 case NVME_IOCTL_ID:
1180 force_successful_syscall_return();
1181 return ns->head->ns_id;
1182 case NVME_IOCTL_ADMIN_CMD:
1183 return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
1184 case NVME_IOCTL_IO_CMD:
1185 return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
1186 case NVME_IOCTL_SUBMIT_IO:
1187 return nvme_submit_io(ns, (void __user *)arg);
1188 default:
1189 #ifdef CONFIG_NVM
1190 if (ns->ndev)
1191 return nvme_nvm_ioctl(ns, cmd, arg);
1192 #endif
1193 if (is_sed_ioctl(cmd))
1194 return sed_ioctl(ns->ctrl->opal_dev, cmd,
1195 (void __user *) arg);
1196 return -ENOTTY;
1197 }
1198 }
1199
1200 static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
1201 unsigned int cmd, unsigned long arg)
1202 {
1203 struct nvme_ns_head *head = NULL;
1204 struct nvme_ns *ns;
1205 int srcu_idx, ret;
1206
1207 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
1208 if (unlikely(!ns))
1209 ret = -EWOULDBLOCK;
1210 else
1211 ret = nvme_ns_ioctl(ns, cmd, arg);
1212 nvme_put_ns_from_disk(head, srcu_idx);
1213 return ret;
1214 }
1215
1216 static int nvme_open(struct block_device *bdev, fmode_t mode)
1217 {
1218 struct nvme_ns *ns = bdev->bd_disk->private_data;
1219
1220 #ifdef CONFIG_NVME_MULTIPATH
1221 /* should never be called due to GENHD_FL_HIDDEN */
1222 if (WARN_ON_ONCE(ns->head->disk))
1223 return -ENXIO;
1224 #endif
1225 if (!kref_get_unless_zero(&ns->kref))
1226 return -ENXIO;
1227 return 0;
1228 }
1229
1230 static void nvme_release(struct gendisk *disk, fmode_t mode)
1231 {
1232 nvme_put_ns(disk->private_data);
1233 }
1234
1235 static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
1236 {
1237 /* some standard values */
1238 geo->heads = 1 << 6;
1239 geo->sectors = 1 << 5;
1240 geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
1241 return 0;
1242 }
1243
1244 #ifdef CONFIG_BLK_DEV_INTEGRITY
1245 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1246 {
1247 struct blk_integrity integrity;
1248
1249 memset(&integrity, 0, sizeof(integrity));
1250 switch (pi_type) {
1251 case NVME_NS_DPS_PI_TYPE3:
1252 integrity.profile = &t10_pi_type3_crc;
1253 integrity.tag_size = sizeof(u16) + sizeof(u32);
1254 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1255 break;
1256 case NVME_NS_DPS_PI_TYPE1:
1257 case NVME_NS_DPS_PI_TYPE2:
1258 integrity.profile = &t10_pi_type1_crc;
1259 integrity.tag_size = sizeof(u16);
1260 integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
1261 break;
1262 default:
1263 integrity.profile = NULL;
1264 break;
1265 }
1266 integrity.tuple_size = ms;
1267 blk_integrity_register(disk, &integrity);
1268 blk_queue_max_integrity_segments(disk->queue, 1);
1269 }
1270 #else
1271 static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
1272 {
1273 }
1274 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1275
1276 static void nvme_set_chunk_size(struct nvme_ns *ns)
1277 {
1278 u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
1279 blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
1280 }
1281
1282 static void nvme_config_discard(struct nvme_ctrl *ctrl,
1283 unsigned stream_alignment, struct request_queue *queue)
1284 {
1285 u32 size = queue_logical_block_size(queue);
1286
1287 if (stream_alignment)
1288 size *= stream_alignment;
1289
1290 BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
1291 NVME_DSM_MAX_RANGES);
1292
1293 queue->limits.discard_alignment = 0;
1294 queue->limits.discard_granularity = size;
1295
1296 blk_queue_max_discard_sectors(queue, UINT_MAX);
1297 blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
1298 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue);
1299
1300 if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
1301 blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
1302 }
1303
1304 static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
1305 struct nvme_id_ns *id, struct nvme_ns_ids *ids)
1306 {
1307 memset(ids, 0, sizeof(*ids));
1308
1309 if (ctrl->vs >= NVME_VS(1, 1, 0))
1310 memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
1311 if (ctrl->vs >= NVME_VS(1, 2, 0))
1312 memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
1313 if (ctrl->vs >= NVME_VS(1, 3, 0)) {
1314 /* Don't treat an error as fatal; we potentially
1315 * already have an NGUID or EUI-64
1316 */
1317 if (nvme_identify_ns_descs(ctrl, nsid, ids))
1318 dev_warn(ctrl->device,
1319 "%s: Identify Descriptors failed\n", __func__);
1320 }
1321 }
1322
1323 static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
1324 {
1325 return !uuid_is_null(&ids->uuid) ||
1326 memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
1327 memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
1328 }
1329
1330 static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
1331 {
1332 return uuid_equal(&a->uuid, &b->uuid) &&
1333 memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
1334 memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
1335 }
1336
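/*
 * Apply block size, capacity, integrity and discard settings derived
 * from the Identify Namespace data while the queue is frozen.
 */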
1337 static void nvme_update_disk_info(struct gendisk *disk,
1338 struct nvme_ns *ns, struct nvme_id_ns *id)
1339 {
1340 sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
1341 unsigned short bs = 1 << ns->lba_shift;
1342 unsigned stream_alignment = 0;
1343
1344 if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
1345 stream_alignment = ns->sws * ns->sgs;
1346
1347 blk_mq_freeze_queue(disk->queue);
1348 blk_integrity_unregister(disk);
1349
1350 blk_queue_logical_block_size(disk->queue, bs);
1351 blk_queue_physical_block_size(disk->queue, bs);
1352 blk_queue_io_min(disk->queue, bs);
1353
1354 if (ns->ms && !ns->ext &&
1355 (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
1356 nvme_init_integrity(disk, ns->ms, ns->pi_type);
1357 if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk))
1358 capacity = 0;
1359 set_capacity(disk, capacity);
1360
1361 if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
1362 nvme_config_discard(ns->ctrl, stream_alignment, disk->queue);
1363 blk_mq_unfreeze_queue(disk->queue);
1364 }
1365
1366 static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
1367 {
1368 struct nvme_ns *ns = disk->private_data;
1369
1370 /*
1371 * If identify namespace failed, use a default 512 byte block size so
1372 * the block layer can be used before failing reads/writes for 0 capacity.
1373 */
1374 ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
1375 if (ns->lba_shift == 0)
1376 ns->lba_shift = 9;
1377 ns->noiob = le16_to_cpu(id->noiob);
1378 ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
1379 ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
1380 /* the PI implementation requires metadata equal to the T10 PI tuple size */
1381 if (ns->ms == sizeof(struct t10_pi_tuple))
1382 ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
1383 else
1384 ns->pi_type = 0;
1385
1386 if (ns->noiob)
1387 nvme_set_chunk_size(ns);
1388 nvme_update_disk_info(disk, ns, id);
1389 #ifdef CONFIG_NVME_MULTIPATH
1390 if (ns->head->disk)
1391 nvme_update_disk_info(ns->head->disk, ns, id);
1392 #endif
1393 }
1394
1395 static int nvme_revalidate_disk(struct gendisk *disk)
1396 {
1397 struct nvme_ns *ns = disk->private_data;
1398 struct nvme_ctrl *ctrl = ns->ctrl;
1399 struct nvme_id_ns *id;
1400 struct nvme_ns_ids ids;
1401 int ret = 0;
1402
1403 if (test_bit(NVME_NS_DEAD, &ns->flags)) {
1404 set_capacity(disk, 0);
1405 return -ENODEV;
1406 }
1407
1408 id = nvme_identify_ns(ctrl, ns->head->ns_id);
1409 if (!id)
1410 return -ENODEV;
1411
1412 if (id->ncap == 0) {
1413 ret = -ENODEV;
1414 goto out;
1415 }
1416
1417 __nvme_revalidate_disk(disk, id);
1418 nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
1419 if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
1420 dev_err(ctrl->device,
1421 "identifiers changed for nsid %d\n", ns->head->ns_id);
1422 ret = -ENODEV;
1423 }
1424
1425 out:
1426 kfree(id);
1427 return ret;
1428 }
1429
1430 static char nvme_pr_type(enum pr_type type)
1431 {
1432 switch (type) {
1433 case PR_WRITE_EXCLUSIVE:
1434 return 1;
1435 case PR_EXCLUSIVE_ACCESS:
1436 return 2;
1437 case PR_WRITE_EXCLUSIVE_REG_ONLY:
1438 return 3;
1439 case PR_EXCLUSIVE_ACCESS_REG_ONLY:
1440 return 4;
1441 case PR_WRITE_EXCLUSIVE_ALL_REGS:
1442 return 5;
1443 case PR_EXCLUSIVE_ACCESS_ALL_REGS:
1444 return 6;
1445 default:
1446 return 0;
1447 }
1448 };
1449
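/*
 * Build a persistent reservation command with the (key, sa_key) pair as
 * payload and issue it on the first available path.
 */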
1450 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
1451 u64 key, u64 sa_key, u8 op)
1452 {
1453 struct nvme_ns_head *head = NULL;
1454 struct nvme_ns *ns;
1455 struct nvme_command c;
1456 int srcu_idx, ret;
1457 u8 data[16] = { 0, };
1458
1459 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
1460 if (unlikely(!ns))
1461 return -EWOULDBLOCK;
1462
1463 put_unaligned_le64(key, &data[0]);
1464 put_unaligned_le64(sa_key, &data[8]);
1465
1466 memset(&c, 0, sizeof(c));
1467 c.common.opcode = op;
1468 c.common.nsid = cpu_to_le32(ns->head->ns_id);
1469 c.common.cdw10[0] = cpu_to_le32(cdw10);
1470
1471 ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
1472 nvme_put_ns_from_disk(head, srcu_idx);
1473 return ret;
1474 }
1475
1476 static int nvme_pr_register(struct block_device *bdev, u64 old,
1477 u64 new, unsigned flags)
1478 {
1479 u32 cdw10;
1480
1481 if (flags & ~PR_FL_IGNORE_KEY)
1482 return -EOPNOTSUPP;
1483
1484 cdw10 = old ? 2 : 0;
1485 cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
1486 cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
1487 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
1488 }
1489
1490 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
1491 enum pr_type type, unsigned flags)
1492 {
1493 u32 cdw10;
1494
1495 if (flags & ~PR_FL_IGNORE_KEY)
1496 return -EOPNOTSUPP;
1497
1498 cdw10 = nvme_pr_type(type) << 8;
1499 cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
1500 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
1501 }
1502
1503 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
1504 enum pr_type type, bool abort)
1505 {
1506 u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
1507 return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
1508 }
1509
1510 static int nvme_pr_clear(struct block_device *bdev, u64 key)
1511 {
1512 u32 cdw10 = 1 | (key ? 1 << 3 : 0);
1513 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
1514 }
1515
1516 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
1517 {
1518 u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
1519 return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
1520 }
1521
1522 static const struct pr_ops nvme_pr_ops = {
1523 .pr_register = nvme_pr_register,
1524 .pr_reserve = nvme_pr_reserve,
1525 .pr_release = nvme_pr_release,
1526 .pr_preempt = nvme_pr_preempt,
1527 .pr_clear = nvme_pr_clear,
1528 };
1529
1530 #ifdef CONFIG_BLK_SED_OPAL
1531 int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
1532 bool send)
1533 {
1534 struct nvme_ctrl *ctrl = data;
1535 struct nvme_command cmd;
1536
1537 memset(&cmd, 0, sizeof(cmd));
1538 if (send)
1539 cmd.common.opcode = nvme_admin_security_send;
1540 else
1541 cmd.common.opcode = nvme_admin_security_recv;
1542 cmd.common.nsid = 0;
1543 cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
1544 cmd.common.cdw10[1] = cpu_to_le32(len);
1545
1546 return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
1547 ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
1548 }
1549 EXPORT_SYMBOL_GPL(nvme_sec_submit);
1550 #endif /* CONFIG_BLK_SED_OPAL */
1551
1552 static const struct block_device_operations nvme_fops = {
1553 .owner = THIS_MODULE,
1554 .ioctl = nvme_ioctl,
1555 .compat_ioctl = nvme_ioctl,
1556 .open = nvme_open,
1557 .release = nvme_release,
1558 .getgeo = nvme_getgeo,
1559 .revalidate_disk= nvme_revalidate_disk,
1560 .pr_ops = &nvme_pr_ops,
1561 };
1562
1563 #ifdef CONFIG_NVME_MULTIPATH
1564 static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
1565 {
1566 struct nvme_ns_head *head = bdev->bd_disk->private_data;
1567
1568 if (!kref_get_unless_zero(&head->ref))
1569 return -ENXIO;
1570 return 0;
1571 }
1572
1573 static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
1574 {
1575 nvme_put_ns_head(disk->private_data);
1576 }
1577
1578 const struct block_device_operations nvme_ns_head_ops = {
1579 .owner = THIS_MODULE,
1580 .open = nvme_ns_head_open,
1581 .release = nvme_ns_head_release,
1582 .ioctl = nvme_ioctl,
1583 .compat_ioctl = nvme_ioctl,
1584 .getgeo = nvme_getgeo,
1585 .pr_ops = &nvme_pr_ops,
1586 };
1587 #endif /* CONFIG_NVME_MULTIPATH */
1588
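/*
 * Poll CSTS.RDY until it matches the expected state, giving up after
 * the timeout advertised in CAP.TO or on a fatal signal.
 */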
1589 static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
1590 {
1591 unsigned long timeout =
1592 ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
1593 u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
1594 int ret;
1595
1596 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1597 if (csts == ~0)
1598 return -ENODEV;
1599 if ((csts & NVME_CSTS_RDY) == bit)
1600 break;
1601
1602 msleep(100);
1603 if (fatal_signal_pending(current))
1604 return -EINTR;
1605 if (time_after(jiffies, timeout)) {
1606 dev_err(ctrl->device,
1607 "Device not ready; aborting %s\n", enabled ?
1608 "initialisation" : "reset");
1609 return -ENODEV;
1610 }
1611 }
1612
1613 return ret;
1614 }
1615
1616 /*
1617 * If the device has been passed off to us in an enabled state, just clear
1618 * the enabled bit. The spec says we should set the 'shutdown notification
1619 * bits', but doing so may cause the device to complete commands to the
1620 * admin queue ... and we don't know what memory that might be pointing at!
1621 */
1622 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1623 {
1624 int ret;
1625
1626 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1627 ctrl->ctrl_config &= ~NVME_CC_ENABLE;
1628
1629 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1630 if (ret)
1631 return ret;
1632
1633 if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
1634 msleep(NVME_QUIRK_DELAY_AMOUNT);
1635
1636 return nvme_wait_ready(ctrl, cap, false);
1637 }
1638 EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
1639
1640 int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
1641 {
1642 /*
1643 * Default to a 4K page size, with the intention to update this
1644 * path in the future to accommodate architectures with differing
1645 * kernel and IO page sizes.
1646 */
1647 unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
1648 int ret;
1649
1650 if (page_shift < dev_page_min) {
1651 dev_err(ctrl->device,
1652 "Minimum device page size %u too large for host (%u)\n",
1653 1 << dev_page_min, 1 << page_shift);
1654 return -ENODEV;
1655 }
1656
1657 ctrl->page_size = 1 << page_shift;
1658
1659 ctrl->ctrl_config = NVME_CC_CSS_NVM;
1660 ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
1661 ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
1662 ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
1663 ctrl->ctrl_config |= NVME_CC_ENABLE;
1664
1665 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1666 if (ret)
1667 return ret;
1668 return nvme_wait_ready(ctrl, cap, true);
1669 }
1670 EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
1671
1672 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
1673 {
1674 unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
1675 u32 csts;
1676 int ret;
1677
1678 ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
1679 ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
1680
1681 ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
1682 if (ret)
1683 return ret;
1684
1685 while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
1686 if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
1687 break;
1688
1689 msleep(100);
1690 if (fatal_signal_pending(current))
1691 return -EINTR;
1692 if (time_after(jiffies, timeout)) {
1693 dev_err(ctrl->device,
1694 "Device shutdown incomplete; abort shutdown\n");
1695 return -ENODEV;
1696 }
1697 }
1698
1699 return ret;
1700 }
1701 EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
1702
1703 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
1704 struct request_queue *q)
1705 {
1706 bool vwc = false;
1707
1708 if (ctrl->max_hw_sectors) {
1709 u32 max_segments =
1710 (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
1711
1712 blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
1713 blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
1714 }
1715 if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
1716 is_power_of_2(ctrl->max_hw_sectors))
1717 blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
1718 blk_queue_virt_boundary(q, ctrl->page_size - 1);
1719 if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
1720 vwc = true;
1721 blk_queue_write_cache(q, vwc, vwc);
1722 }
1723
1724 static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
1725 {
1726 __le64 ts;
1727 int ret;
1728
1729 if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
1730 return 0;
1731
1732 ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
1733 ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
1734 NULL);
1735 if (ret)
1736 dev_warn_once(ctrl->device,
1737 "could not set timestamp (%d)\n", ret);
1738 return ret;
1739 }
1740
1741 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
1742 {
1743 /*
1744 * APST (Autonomous Power State Transition) lets us program a
1745 * table of power state transitions that the controller will
1746 * perform automatically. We configure it with a simple
1747 * heuristic: we are willing to spend at most 2% of the time
1748 * transitioning between power states. Therefore, when running
1749 * in any given state, we will enter the next lower-power
1750 * non-operational state after waiting 50 * (enlat + exlat)
1751 * microseconds, as long as that state's exit latency is under
1752 * the requested maximum latency.
1753 *
1754 * We will not autonomously enter any non-operational state for
1755 * which the total latency exceeds ps_max_latency_us. Users
1756 * can set ps_max_latency_us to zero to turn off APST.
1757 */
1758
1759 unsigned apste;
1760 struct nvme_feat_auto_pst *table;
1761 u64 max_lat_us = 0;
1762 int max_ps = -1;
1763 int ret;
1764
1765 /*
1766 * If APST isn't supported or if we haven't been initialized yet,
1767 * then don't do anything.
1768 */
1769 if (!ctrl->apsta)
1770 return 0;
1771
1772 if (ctrl->npss > 31) {
1773 dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
1774 return 0;
1775 }
1776
1777 table = kzalloc(sizeof(*table), GFP_KERNEL);
1778 if (!table)
1779 return 0;
1780
1781 if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
1782 /* Turn off APST. */
1783 apste = 0;
1784 dev_dbg(ctrl->device, "APST disabled\n");
1785 } else {
1786 __le64 target = cpu_to_le64(0);
1787 int state;
1788
1789 /*
1790 * Walk through all states from lowest- to highest-power.
1791 * According to the spec, lower-numbered states use more
1792 * power. NPSS, despite the name, is the index of the
1793 * lowest-power state, not the number of states.
1794 */
1795 for (state = (int)ctrl->npss; state >= 0; state--) {
1796 u64 total_latency_us, exit_latency_us, transition_ms;
1797
1798 if (target)
1799 table->entries[state] = target;
1800
1801 /*
1802 * Don't allow transitions to the deepest state
1803 * if it's quirked off.
1804 */
1805 if (state == ctrl->npss &&
1806 (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
1807 continue;
1808
1809 /*
1810 * Is this state a useful non-operational state for
1811 * higher-power states to autonomously transition to?
1812 */
1813 if (!(ctrl->psd[state].flags &
1814 NVME_PS_FLAGS_NON_OP_STATE))
1815 continue;
1816
1817 exit_latency_us =
1818 (u64)le32_to_cpu(ctrl->psd[state].exit_lat);
1819 if (exit_latency_us > ctrl->ps_max_latency_us)
1820 continue;
1821
1822 total_latency_us =
1823 exit_latency_us +
1824 le32_to_cpu(ctrl->psd[state].entry_lat);
1825
1826 /*
1827 * This state is good. Use it as the APST idle
1828 * target for higher power states.
1829 */
1830 transition_ms = total_latency_us + 19;
1831 do_div(transition_ms, 20);
1832 if (transition_ms > (1 << 24) - 1)
1833 transition_ms = (1 << 24) - 1;
1834
1835 target = cpu_to_le64((state << 3) |
1836 (transition_ms << 8));
1837
1838 if (max_ps == -1)
1839 max_ps = state;
1840
1841 if (total_latency_us > max_lat_us)
1842 max_lat_us = total_latency_us;
1843 }
1844
1845 apste = 1;
1846
1847 if (max_ps == -1) {
1848 dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
1849 } else {
1850 dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
1851 max_ps, max_lat_us, (int)sizeof(*table), table);
1852 }
1853 }
1854
1855 ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
1856 table, sizeof(*table), NULL);
1857 if (ret)
1858 dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
1859
1860 kfree(table);
1861 return ret;
1862 }
1863
1864 static void nvme_set_latency_tolerance(struct device *dev, s32 val)
1865 {
1866 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
1867 u64 latency;
1868
1869 switch (val) {
1870 case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
1871 case PM_QOS_LATENCY_ANY:
1872 latency = U64_MAX;
1873 break;
1874
1875 default:
1876 latency = val;
1877 }
1878
1879 if (ctrl->ps_max_latency_us != latency) {
1880 ctrl->ps_max_latency_us = latency;
1881 nvme_configure_apst(ctrl);
1882 }
1883 }
1884
1885 struct nvme_core_quirk_entry {
1886 /*
1887 * NVMe model and firmware strings are padded with spaces. For
1888 * simplicity, strings in the quirk table are padded with NULLs
1889 * instead.
1890 */
1891 u16 vid;
1892 const char *mn;
1893 const char *fr;
1894 unsigned long quirks;
1895 };
1896
1897 static const struct nvme_core_quirk_entry core_quirks[] = {
1898 {
1899 /*
1900 * This Toshiba device seems to die using any APST states. See:
1901 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
1902 */
1903 .vid = 0x1179,
1904 .mn = "THNSF5256GPUK TOSHIBA",
1905 .quirks = NVME_QUIRK_NO_APST,
1906 }
1907 };
1908
1909 /* match is null-terminated but idstr is space-padded. */
1910 static bool string_matches(const char *idstr, const char *match, size_t len)
1911 {
1912 size_t matchlen;
1913
1914 if (!match)
1915 return true;
1916
1917 matchlen = strlen(match);
1918 WARN_ON_ONCE(matchlen > len);
1919
1920 if (memcmp(idstr, match, matchlen))
1921 return false;
1922
1923 for (; matchlen < len; matchlen++)
1924 if (idstr[matchlen] != ' ')
1925 return false;
1926
1927 return true;
1928 }
1929
1930 static bool quirk_matches(const struct nvme_id_ctrl *id,
1931 const struct nvme_core_quirk_entry *q)
1932 {
1933 return q->vid == le16_to_cpu(id->vid) &&
1934 string_matches(id->mn, q->mn, sizeof(id->mn)) &&
1935 string_matches(id->fr, q->fr, sizeof(id->fr));
1936 }
1937
1938 static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
1939 struct nvme_id_ctrl *id)
1940 {
1941 size_t nqnlen;
1942 int off;
1943
1944 if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
1945 nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
1946 if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
1947 strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
1948 return;
1949 }
1950
1951 if (ctrl->vs >= NVME_VS(1, 2, 1))
1952 dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
1953 }
1954
1955 /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
1956 off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
1957 "nqn.2014.08.org.nvmexpress:%04x%04x",
1958 le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
1959 memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
1960 off += sizeof(id->sn);
1961 memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
1962 off += sizeof(id->mn);
1963 memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
1964 }
1965
1966 static void __nvme_release_subsystem(struct nvme_subsystem *subsys)
1967 {
1968 ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
1969 kfree(subsys);
1970 }
1971
1972 static void nvme_release_subsystem(struct device *dev)
1973 {
1974 __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev));
1975 }
1976
1977 static void nvme_destroy_subsystem(struct kref *ref)
1978 {
1979 struct nvme_subsystem *subsys =
1980 container_of(ref, struct nvme_subsystem, ref);
1981
1982 mutex_lock(&nvme_subsystems_lock);
1983 list_del(&subsys->entry);
1984 mutex_unlock(&nvme_subsystems_lock);
1985
1986 ida_destroy(&subsys->ns_ida);
1987 device_del(&subsys->dev);
1988 put_device(&subsys->dev);
1989 }
1990
1991 static void nvme_put_subsystem(struct nvme_subsystem *subsys)
1992 {
1993 kref_put(&subsys->ref, nvme_destroy_subsystem);
1994 }
1995
1996 static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
1997 {
1998 struct nvme_subsystem *subsys;
1999
2000 lockdep_assert_held(&nvme_subsystems_lock);
2001
2002 list_for_each_entry(subsys, &nvme_subsystems, entry) {
2003 if (strcmp(subsys->subnqn, subsysnqn))
2004 continue;
2005 if (!kref_get_unless_zero(&subsys->ref))
2006 continue;
2007 return subsys;
2008 }
2009
2010 return NULL;
2011 }
2012
2013 #define SUBSYS_ATTR_RO(_name, _mode, _show) \
2014 struct device_attribute subsys_attr_##_name = \
2015 __ATTR(_name, _mode, _show, NULL)
2016
2017 static ssize_t nvme_subsys_show_nqn(struct device *dev,
2018 struct device_attribute *attr,
2019 char *buf)
2020 {
2021 struct nvme_subsystem *subsys =
2022 container_of(dev, struct nvme_subsystem, dev);
2023
2024 return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn);
2025 }
2026 static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
2027
2028 #define nvme_subsys_show_str_function(field) \
2029 static ssize_t subsys_##field##_show(struct device *dev, \
2030 struct device_attribute *attr, char *buf) \
2031 { \
2032 struct nvme_subsystem *subsys = \
2033 container_of(dev, struct nvme_subsystem, dev); \
2034 return sprintf(buf, "%.*s\n", \
2035 (int)sizeof(subsys->field), subsys->field); \
2036 } \
2037 static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
2038
2039 nvme_subsys_show_str_function(model);
2040 nvme_subsys_show_str_function(serial);
2041 nvme_subsys_show_str_function(firmware_rev);
2042
2043 static struct attribute *nvme_subsys_attrs[] = {
2044 &subsys_attr_model.attr,
2045 &subsys_attr_serial.attr,
2046 &subsys_attr_firmware_rev.attr,
2047 &subsys_attr_subsysnqn.attr,
2048 NULL,
2049 };
2050
2051 static struct attribute_group nvme_subsys_attrs_group = {
2052 .attrs = nvme_subsys_attrs,
2053 };
2054
2055 static const struct attribute_group *nvme_subsys_attrs_groups[] = {
2056 &nvme_subsys_attrs_group,
2057 NULL,
2058 };
2059
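/*
 * Count the controllers in a subsystem that are still alive, i.e. not in
 * the DELETING or DEAD state.  Used below to reject a duplicate subsystem
 * NQN when the controller does not advertise multi-controller support.
 */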
2060 static int nvme_active_ctrls(struct nvme_subsystem *subsys)
2061 {
2062 int count = 0;
2063 struct nvme_ctrl *ctrl;
2064
2065 mutex_lock(&subsys->lock);
2066 list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
2067 if (ctrl->state != NVME_CTRL_DELETING &&
2068 ctrl->state != NVME_CTRL_DEAD)
2069 count++;
2070 }
2071 mutex_unlock(&subsys->lock);
2072
2073 return count;
2074 }
2075
2076 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
2077 {
2078 struct nvme_subsystem *subsys, *found;
2079 int ret;
2080
2081 subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
2082 if (!subsys)
2083 return -ENOMEM;
2084 ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
2085 if (ret < 0) {
2086 kfree(subsys);
2087 return ret;
2088 }
2089 subsys->instance = ret;
2090 mutex_init(&subsys->lock);
2091 kref_init(&subsys->ref);
2092 INIT_LIST_HEAD(&subsys->ctrls);
2093 INIT_LIST_HEAD(&subsys->nsheads);
2094 nvme_init_subnqn(subsys, ctrl, id);
2095 memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
2096 memcpy(subsys->model, id->mn, sizeof(subsys->model));
2097 memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
2098 subsys->vendor_id = le16_to_cpu(id->vid);
2099 subsys->cmic = id->cmic;
2100
2101 subsys->dev.class = nvme_subsys_class;
2102 subsys->dev.release = nvme_release_subsystem;
2103 subsys->dev.groups = nvme_subsys_attrs_groups;
2104 dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
2105 device_initialize(&subsys->dev);
2106
2107 mutex_lock(&nvme_subsystems_lock);
2108 found = __nvme_find_get_subsystem(subsys->subnqn);
2109 if (found) {
2110 /*
2111 * Verify that the subsystem actually supports multiple
2112 * controllers, else bail out.
2113 */
2114 if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
2115 dev_err(ctrl->device,
2116 "ignoring ctrl due to duplicate subnqn (%s).\n",
2117 found->subnqn);
2118 nvme_put_subsystem(found);
2119 ret = -EINVAL;
2120 goto out_unlock;
2121 }
2122
2123 __nvme_release_subsystem(subsys);
2124 subsys = found;
2125 } else {
2126 ret = device_add(&subsys->dev);
2127 if (ret) {
2128 dev_err(ctrl->device,
2129 "failed to register subsystem device.\n");
2130 goto out_unlock;
2131 }
2132 ida_init(&subsys->ns_ida);
2133 list_add_tail(&subsys->entry, &nvme_subsystems);
2134 }
2135
2136 ctrl->subsys = subsys;
2137 mutex_unlock(&nvme_subsystems_lock);
2138
2139 if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
2140 dev_name(ctrl->device))) {
2141 dev_err(ctrl->device,
2142 "failed to create sysfs link from subsystem.\n");
2143 /* the transport driver will eventually put the subsystem */
2144 return -EINVAL;
2145 }
2146
2147 mutex_lock(&subsys->lock);
2148 list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
2149 mutex_unlock(&subsys->lock);
2150
2151 return 0;
2152
2153 out_unlock:
2154 mutex_unlock(&nvme_subsystems_lock);
2155 put_device(&subsys->dev);
2156 return ret;
2157 }
2158
2159 static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
2160 size_t size)
2161 {
2162 struct nvme_command c = { };
2163
2164 c.common.opcode = nvme_admin_get_log_page;
2165 c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
2166 c.common.cdw10[0] = nvme_get_log_dw10(log_page, size);
2167
2168 return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
2169 }
2170
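/*
 * Cache the Commands Supported and Effects log.  The buffer is allocated
 * once and kept across resets; if reading the log fails it is freed again
 * so that a later nvme_init_identify() can retry.
 */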
2171 static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
2172 {
2173 int ret;
2174
2175 if (!ctrl->effects)
2176 ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
2177
2178 if (!ctrl->effects)
2179 return 0;
2180
2181 ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects,
2182 sizeof(*ctrl->effects));
2183 if (ret) {
2184 kfree(ctrl->effects);
2185 ctrl->effects = NULL;
2186 }
2187 return ret;
2188 }
2189
2190 /*
2191 * Initialize the cached copies of the Identify data and various controller
2192  * registers in our nvme_ctrl structure. This should be called as soon as
2193 * the admin queue is fully up and running.
2194 */
2195 int nvme_init_identify(struct nvme_ctrl *ctrl)
2196 {
2197 struct nvme_id_ctrl *id;
2198 u64 cap;
2199 int ret, page_shift;
2200 u32 max_hw_sectors;
2201 bool prev_apst_enabled;
2202
2203 ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
2204 if (ret) {
2205 dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
2206 return ret;
2207 }
2208
2209 ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
2210 if (ret) {
2211 dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
2212 return ret;
2213 }
2214 page_shift = NVME_CAP_MPSMIN(cap) + 12;
2215
2216 if (ctrl->vs >= NVME_VS(1, 1, 0))
2217 ctrl->subsystem = NVME_CAP_NSSRC(cap);
2218
2219 ret = nvme_identify_ctrl(ctrl, &id);
2220 if (ret) {
2221 dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
2222 return -EIO;
2223 }
2224
2225 if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
2226 ret = nvme_get_effects_log(ctrl);
2227 if (ret < 0)
2228 return ret;
2229 }
2230
2231 if (!ctrl->identified) {
2232 int i;
2233
2234 ret = nvme_init_subsystem(ctrl, id);
2235 if (ret)
2236 goto out_free;
2237
2238 /*
2239 * Check for quirks. Quirk can depend on firmware version,
2240 * so, in principle, the set of quirks present can change
2241 * across a reset. As a possible future enhancement, we
2242 * could re-scan for quirks every time we reinitialize
2243 * the device, but we'd have to make sure that the driver
2244 * behaves intelligently if the quirks change.
2245 */
2246 for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
2247 if (quirk_matches(id, &core_quirks[i]))
2248 ctrl->quirks |= core_quirks[i].quirks;
2249 }
2250 }
2251
2252 if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
2253 dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
2254 ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
2255 }
2256
2257 ctrl->oacs = le16_to_cpu(id->oacs);
2258 ctrl->oncs = le16_to_cpup(&id->oncs);
2259 atomic_set(&ctrl->abort_limit, id->acl + 1);
2260 ctrl->vwc = id->vwc;
2261 ctrl->cntlid = le16_to_cpup(&id->cntlid);
2262 if (id->mdts)
2263 max_hw_sectors = 1 << (id->mdts + page_shift - 9);
2264 else
2265 max_hw_sectors = UINT_MAX;
2266 ctrl->max_hw_sectors =
2267 min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
2268
2269 nvme_set_queue_limits(ctrl, ctrl->admin_q);
2270 ctrl->sgls = le32_to_cpu(id->sgls);
2271 ctrl->kas = le16_to_cpu(id->kas);
2272
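	/*
	 * If the controller reports an RTD3 Entry Latency, use it (converted
	 * from microseconds to seconds) as the shutdown timeout, clamped
	 * between the shutdown_timeout module parameter and 60 seconds.
	 */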
2273 if (id->rtd3e) {
2274 /* us -> s */
2275 u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
2276
2277 ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
2278 shutdown_timeout, 60);
2279
2280 if (ctrl->shutdown_timeout != shutdown_timeout)
2281 dev_warn(ctrl->device,
2282 "Shutdown timeout set to %u seconds\n",
2283 ctrl->shutdown_timeout);
2284 } else
2285 ctrl->shutdown_timeout = shutdown_timeout;
2286
2287 ctrl->npss = id->npss;
2288 ctrl->apsta = id->apsta;
2289 prev_apst_enabled = ctrl->apst_enabled;
2290 if (ctrl->quirks & NVME_QUIRK_NO_APST) {
2291 if (force_apst && id->apsta) {
2292 dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
2293 ctrl->apst_enabled = true;
2294 } else {
2295 ctrl->apst_enabled = false;
2296 }
2297 } else {
2298 ctrl->apst_enabled = id->apsta;
2299 }
2300 memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
2301
2302 if (ctrl->ops->flags & NVME_F_FABRICS) {
2303 ctrl->icdoff = le16_to_cpu(id->icdoff);
2304 ctrl->ioccsz = le32_to_cpu(id->ioccsz);
2305 ctrl->iorcsz = le32_to_cpu(id->iorcsz);
2306 ctrl->maxcmd = le16_to_cpu(id->maxcmd);
2307
2308 /*
2309 * In fabrics we need to verify the cntlid matches the
2310 * admin connect
2311 */
2312 if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
2313 ret = -EINVAL;
2314 goto out_free;
2315 }
2316
2317 if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
2318 dev_err(ctrl->device,
2319 "keep-alive support is mandatory for fabrics\n");
2320 ret = -EINVAL;
2321 goto out_free;
2322 }
2323 } else {
2324 ctrl->cntlid = le16_to_cpu(id->cntlid);
2325 ctrl->hmpre = le32_to_cpu(id->hmpre);
2326 ctrl->hmmin = le32_to_cpu(id->hmmin);
2327 ctrl->hmminds = le32_to_cpu(id->hmminds);
2328 ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
2329 }
2330
2331 kfree(id);
2332
2333 if (ctrl->apst_enabled && !prev_apst_enabled)
2334 dev_pm_qos_expose_latency_tolerance(ctrl->device);
2335 else if (!ctrl->apst_enabled && prev_apst_enabled)
2336 dev_pm_qos_hide_latency_tolerance(ctrl->device);
2337
2338 ret = nvme_configure_apst(ctrl);
2339 if (ret < 0)
2340 return ret;
2341
2342 ret = nvme_configure_timestamp(ctrl);
2343 if (ret < 0)
2344 return ret;
2345
2346 ret = nvme_configure_directives(ctrl);
2347 if (ret < 0)
2348 return ret;
2349
2350 ctrl->identified = true;
2351
2352 return 0;
2353
2354 out_free:
2355 kfree(id);
2356 return ret;
2357 }
2358 EXPORT_SYMBOL_GPL(nvme_init_identify);
2359
2360 static int nvme_dev_open(struct inode *inode, struct file *file)
2361 {
2362 struct nvme_ctrl *ctrl =
2363 container_of(inode->i_cdev, struct nvme_ctrl, cdev);
2364
2365 if (ctrl->state != NVME_CTRL_LIVE)
2366 return -EWOULDBLOCK;
2367 file->private_data = ctrl;
2368 return 0;
2369 }
2370
2371 static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
2372 {
2373 struct nvme_ns *ns;
2374 int ret;
2375
2376 mutex_lock(&ctrl->namespaces_mutex);
2377 if (list_empty(&ctrl->namespaces)) {
2378 ret = -ENOTTY;
2379 goto out_unlock;
2380 }
2381
2382 ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
2383 if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
2384 dev_warn(ctrl->device,
2385 "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
2386 ret = -EINVAL;
2387 goto out_unlock;
2388 }
2389
2390 dev_warn(ctrl->device,
2391 "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
2392 kref_get(&ns->kref);
2393 mutex_unlock(&ctrl->namespaces_mutex);
2394
2395 ret = nvme_user_cmd(ctrl, ns, argp);
2396 nvme_put_ns(ns);
2397 return ret;
2398
2399 out_unlock:
2400 mutex_unlock(&ctrl->namespaces_mutex);
2401 return ret;
2402 }
2403
2404 static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
2405 unsigned long arg)
2406 {
2407 struct nvme_ctrl *ctrl = file->private_data;
2408 void __user *argp = (void __user *)arg;
2409
2410 switch (cmd) {
2411 case NVME_IOCTL_ADMIN_CMD:
2412 return nvme_user_cmd(ctrl, NULL, argp);
2413 case NVME_IOCTL_IO_CMD:
2414 return nvme_dev_user_cmd(ctrl, argp);
2415 case NVME_IOCTL_RESET:
2416 dev_warn(ctrl->device, "resetting controller\n");
2417 return nvme_reset_ctrl_sync(ctrl);
2418 case NVME_IOCTL_SUBSYS_RESET:
2419 return nvme_reset_subsystem(ctrl);
2420 case NVME_IOCTL_RESCAN:
2421 nvme_queue_scan(ctrl);
2422 return 0;
2423 default:
2424 return -ENOTTY;
2425 }
2426 }
2427
2428 static const struct file_operations nvme_dev_fops = {
2429 .owner = THIS_MODULE,
2430 .open = nvme_dev_open,
2431 .unlocked_ioctl = nvme_dev_ioctl,
2432 .compat_ioctl = nvme_dev_ioctl,
2433 };
2434
2435 static ssize_t nvme_sysfs_reset(struct device *dev,
2436 struct device_attribute *attr, const char *buf,
2437 size_t count)
2438 {
2439 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2440 int ret;
2441
2442 ret = nvme_reset_ctrl_sync(ctrl);
2443 if (ret < 0)
2444 return ret;
2445 return count;
2446 }
2447 static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
2448
2449 static ssize_t nvme_sysfs_rescan(struct device *dev,
2450 struct device_attribute *attr, const char *buf,
2451 size_t count)
2452 {
2453 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2454
2455 nvme_queue_scan(ctrl);
2456 return count;
2457 }
2458 static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
2459
2460 static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
2461 {
2462 struct gendisk *disk = dev_to_disk(dev);
2463
2464 if (disk->fops == &nvme_fops)
2465 return nvme_get_ns_from_dev(dev)->head;
2466 else
2467 return disk->private_data;
2468 }
2469
2470 static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
2471 char *buf)
2472 {
2473 struct nvme_ns_head *head = dev_to_ns_head(dev);
2474 struct nvme_ns_ids *ids = &head->ids;
2475 struct nvme_subsystem *subsys = head->subsys;
2476 int serial_len = sizeof(subsys->serial);
2477 int model_len = sizeof(subsys->model);
2478
2479 if (!uuid_is_null(&ids->uuid))
2480 return sprintf(buf, "uuid.%pU\n", &ids->uuid);
2481
2482 if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2483 return sprintf(buf, "eui.%16phN\n", ids->nguid);
2484
2485 if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
2486 return sprintf(buf, "eui.%8phN\n", ids->eui64);
2487
2488 while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
2489 subsys->serial[serial_len - 1] == '\0'))
2490 serial_len--;
2491 while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
2492 subsys->model[model_len - 1] == '\0'))
2493 model_len--;
2494
2495 return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
2496 serial_len, subsys->serial, model_len, subsys->model,
2497 head->ns_id);
2498 }
2499 static DEVICE_ATTR_RO(wwid);
2500
2501 static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
2502 char *buf)
2503 {
2504 return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
2505 }
2506 static DEVICE_ATTR_RO(nguid);
2507
2508 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
2509 char *buf)
2510 {
2511 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
2512
2513 	/* For backward compatibility, expose the NGUID to userspace if
2514 	 * we have no UUID set.
2515 */
2516 if (uuid_is_null(&ids->uuid)) {
2517 printk_ratelimited(KERN_WARNING
2518 "No UUID available providing old NGUID\n");
2519 return sprintf(buf, "%pU\n", ids->nguid);
2520 }
2521 return sprintf(buf, "%pU\n", &ids->uuid);
2522 }
2523 static DEVICE_ATTR_RO(uuid);
2524
2525 static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
2526 char *buf)
2527 {
2528 return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
2529 }
2530 static DEVICE_ATTR_RO(eui);
2531
2532 static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
2533 char *buf)
2534 {
2535 return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
2536 }
2537 static DEVICE_ATTR_RO(nsid);
2538
2539 static struct attribute *nvme_ns_id_attrs[] = {
2540 &dev_attr_wwid.attr,
2541 &dev_attr_uuid.attr,
2542 &dev_attr_nguid.attr,
2543 &dev_attr_eui.attr,
2544 &dev_attr_nsid.attr,
2545 NULL,
2546 };
2547
2548 static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
2549 struct attribute *a, int n)
2550 {
2551 struct device *dev = container_of(kobj, struct device, kobj);
2552 struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
2553
2554 if (a == &dev_attr_uuid.attr) {
2555 if (uuid_is_null(&ids->uuid) &&
2556 !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2557 return 0;
2558 }
2559 if (a == &dev_attr_nguid.attr) {
2560 if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
2561 return 0;
2562 }
2563 if (a == &dev_attr_eui.attr) {
2564 if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
2565 return 0;
2566 }
2567 return a->mode;
2568 }
2569
2570 const struct attribute_group nvme_ns_id_attr_group = {
2571 .attrs = nvme_ns_id_attrs,
2572 .is_visible = nvme_ns_id_attrs_are_visible,
2573 };
2574
2575 #define nvme_show_str_function(field) \
2576 static ssize_t field##_show(struct device *dev, \
2577 struct device_attribute *attr, char *buf) \
2578 { \
2579 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
2580 return sprintf(buf, "%.*s\n", \
2581 (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \
2582 } \
2583 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2584
2585 nvme_show_str_function(model);
2586 nvme_show_str_function(serial);
2587 nvme_show_str_function(firmware_rev);
2588
2589 #define nvme_show_int_function(field) \
2590 static ssize_t field##_show(struct device *dev, \
2591 struct device_attribute *attr, char *buf) \
2592 { \
2593 struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \
2594 return sprintf(buf, "%d\n", ctrl->field); \
2595 } \
2596 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
2597
2598 nvme_show_int_function(cntlid);
2599
2600 static ssize_t nvme_sysfs_delete(struct device *dev,
2601 struct device_attribute *attr, const char *buf,
2602 size_t count)
2603 {
2604 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2605
2606 if (device_remove_file_self(dev, attr))
2607 nvme_delete_ctrl_sync(ctrl);
2608 return count;
2609 }
2610 static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
2611
2612 static ssize_t nvme_sysfs_show_transport(struct device *dev,
2613 struct device_attribute *attr,
2614 char *buf)
2615 {
2616 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2617
2618 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
2619 }
2620 static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
2621
2622 static ssize_t nvme_sysfs_show_state(struct device *dev,
2623 struct device_attribute *attr,
2624 char *buf)
2625 {
2626 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2627 static const char *const state_name[] = {
2628 [NVME_CTRL_NEW] = "new",
2629 [NVME_CTRL_LIVE] = "live",
2630 [NVME_CTRL_RESETTING] = "resetting",
2631 		[NVME_CTRL_RECONNECTING] = "reconnecting",
2632 [NVME_CTRL_DELETING] = "deleting",
2633 [NVME_CTRL_DEAD] = "dead",
2634 };
2635
2636 if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
2637 state_name[ctrl->state])
2638 return sprintf(buf, "%s\n", state_name[ctrl->state]);
2639
2640 return sprintf(buf, "unknown state\n");
2641 }
2642
2643 static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
2644
2645 static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
2646 struct device_attribute *attr,
2647 char *buf)
2648 {
2649 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2650
2651 return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn);
2652 }
2653 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
2654
2655 static ssize_t nvme_sysfs_show_address(struct device *dev,
2656 struct device_attribute *attr,
2657 char *buf)
2658 {
2659 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2660
2661 return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
2662 }
2663 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
2664
2665 static struct attribute *nvme_dev_attrs[] = {
2666 &dev_attr_reset_controller.attr,
2667 &dev_attr_rescan_controller.attr,
2668 &dev_attr_model.attr,
2669 &dev_attr_serial.attr,
2670 &dev_attr_firmware_rev.attr,
2671 &dev_attr_cntlid.attr,
2672 &dev_attr_delete_controller.attr,
2673 &dev_attr_transport.attr,
2674 &dev_attr_subsysnqn.attr,
2675 &dev_attr_address.attr,
2676 &dev_attr_state.attr,
2677 NULL
2678 };
2679
2680 static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
2681 struct attribute *a, int n)
2682 {
2683 struct device *dev = container_of(kobj, struct device, kobj);
2684 struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
2685
2686 if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
2687 return 0;
2688 if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
2689 return 0;
2690
2691 return a->mode;
2692 }
2693
2694 static struct attribute_group nvme_dev_attrs_group = {
2695 .attrs = nvme_dev_attrs,
2696 .is_visible = nvme_dev_attrs_are_visible,
2697 };
2698
2699 static const struct attribute_group *nvme_dev_attr_groups[] = {
2700 &nvme_dev_attrs_group,
2701 NULL,
2702 };
2703
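/*
 * Look up a namespace head by NSID within a subsystem and take a
 * reference on it.  The caller must hold subsys->lock.
 */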
2704 static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys,
2705 unsigned nsid)
2706 {
2707 struct nvme_ns_head *h;
2708
2709 lockdep_assert_held(&subsys->lock);
2710
2711 list_for_each_entry(h, &subsys->nsheads, entry) {
2712 if (h->ns_id == nsid && kref_get_unless_zero(&h->ref))
2713 return h;
2714 }
2715
2716 return NULL;
2717 }
2718
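/*
 * Namespace identifiers (EUI-64, NGUID, UUID) must be unique within a
 * subsystem: refuse to add a head whose IDs collide with an existing,
 * still populated head.
 */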
2719 static int __nvme_check_ids(struct nvme_subsystem *subsys,
2720 struct nvme_ns_head *new)
2721 {
2722 struct nvme_ns_head *h;
2723
2724 lockdep_assert_held(&subsys->lock);
2725
2726 list_for_each_entry(h, &subsys->nsheads, entry) {
2727 if (nvme_ns_ids_valid(&new->ids) &&
2728 !list_empty(&h->list) &&
2729 nvme_ns_ids_equal(&new->ids, &h->ids))
2730 return -EINVAL;
2731 }
2732
2733 return 0;
2734 }
2735
2736 static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
2737 unsigned nsid, struct nvme_id_ns *id)
2738 {
2739 struct nvme_ns_head *head;
2740 int ret = -ENOMEM;
2741
2742 head = kzalloc(sizeof(*head), GFP_KERNEL);
2743 if (!head)
2744 goto out;
2745 ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
2746 if (ret < 0)
2747 goto out_free_head;
2748 head->instance = ret;
2749 INIT_LIST_HEAD(&head->list);
2750 init_srcu_struct(&head->srcu);
2751 head->subsys = ctrl->subsys;
2752 head->ns_id = nsid;
2753 kref_init(&head->ref);
2754
2755 nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
2756
2757 ret = __nvme_check_ids(ctrl->subsys, head);
2758 if (ret) {
2759 dev_err(ctrl->device,
2760 "duplicate IDs for nsid %d\n", nsid);
2761 goto out_cleanup_srcu;
2762 }
2763
2764 ret = nvme_mpath_alloc_disk(ctrl, head);
2765 if (ret)
2766 goto out_cleanup_srcu;
2767
2768 list_add_tail(&head->entry, &ctrl->subsys->nsheads);
2769
2770 kref_get(&ctrl->subsys->ref);
2771
2772 return head;
2773 out_cleanup_srcu:
2774 cleanup_srcu_struct(&head->srcu);
2775 ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
2776 out_free_head:
2777 kfree(head);
2778 out:
2779 return ERR_PTR(ret);
2780 }
2781
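/*
 * Attach a namespace to its ns_head.  Namespaces that report themselves
 * as shared (NMIC bit 0) may join an existing head with the same NSID,
 * but only if the reported identifiers match; private namespaces always
 * get a head of their own.
 */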
2782 static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
2783 struct nvme_id_ns *id, bool *new)
2784 {
2785 struct nvme_ctrl *ctrl = ns->ctrl;
2786 bool is_shared = id->nmic & (1 << 0);
2787 struct nvme_ns_head *head = NULL;
2788 int ret = 0;
2789
2790 mutex_lock(&ctrl->subsys->lock);
2791 if (is_shared)
2792 head = __nvme_find_ns_head(ctrl->subsys, nsid);
2793 if (!head) {
2794 head = nvme_alloc_ns_head(ctrl, nsid, id);
2795 if (IS_ERR(head)) {
2796 ret = PTR_ERR(head);
2797 goto out_unlock;
2798 }
2799
2800 *new = true;
2801 } else {
2802 struct nvme_ns_ids ids;
2803
2804 nvme_report_ns_ids(ctrl, nsid, id, &ids);
2805 if (!nvme_ns_ids_equal(&head->ids, &ids)) {
2806 dev_err(ctrl->device,
2807 "IDs don't match for shared namespace %d\n",
2808 nsid);
2809 ret = -EINVAL;
2810 goto out_unlock;
2811 }
2812
2813 *new = false;
2814 }
2815
2816 list_add_tail(&ns->siblings, &head->list);
2817 ns->head = head;
2818
2819 out_unlock:
2820 mutex_unlock(&ctrl->subsys->lock);
2821 return ret;
2822 }
2823
2824 static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
2825 {
2826 struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
2827 struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
2828
2829 return nsa->head->ns_id - nsb->head->ns_id;
2830 }
2831
2832 static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2833 {
2834 struct nvme_ns *ns, *ret = NULL;
2835
2836 mutex_lock(&ctrl->namespaces_mutex);
2837 list_for_each_entry(ns, &ctrl->namespaces, list) {
2838 if (ns->head->ns_id == nsid) {
2839 if (!kref_get_unless_zero(&ns->kref))
2840 continue;
2841 ret = ns;
2842 break;
2843 }
2844 if (ns->head->ns_id > nsid)
2845 break;
2846 }
2847 mutex_unlock(&ctrl->namespaces_mutex);
2848 return ret;
2849 }
2850
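/*
 * If streams are enabled on the controller, query the per-namespace
 * stream parameters and use the stream write size and granularity to
 * seed the queue's io_min and io_opt hints.
 */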
2851 static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
2852 {
2853 struct streams_directive_params s;
2854 int ret;
2855
2856 if (!ctrl->nr_streams)
2857 return 0;
2858
2859 ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
2860 if (ret)
2861 return ret;
2862
2863 ns->sws = le32_to_cpu(s.sws);
2864 ns->sgs = le16_to_cpu(s.sgs);
2865
2866 if (ns->sws) {
2867 unsigned int bs = 1 << ns->lba_shift;
2868
2869 blk_queue_io_min(ns->queue, bs * ns->sws);
2870 if (ns->sgs)
2871 blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs);
2872 }
2873
2874 return 0;
2875 }
2876
2877 static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2878 {
2879 struct nvme_ns *ns;
2880 struct gendisk *disk;
2881 struct nvme_id_ns *id;
2882 char disk_name[DISK_NAME_LEN];
2883 int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
2884 bool new = true;
2885
2886 ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
2887 if (!ns)
2888 return;
2889
2890 ns->queue = blk_mq_init_queue(ctrl->tagset);
2891 if (IS_ERR(ns->queue))
2892 goto out_free_ns;
2893 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
2894 ns->queue->queuedata = ns;
2895 ns->ctrl = ctrl;
2896
2897 kref_init(&ns->kref);
2898 ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
2899
2900 blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
2901 nvme_set_queue_limits(ctrl, ns->queue);
2902
2903 id = nvme_identify_ns(ctrl, nsid);
2904 if (!id)
2905 goto out_free_queue;
2906
2907 if (id->ncap == 0)
2908 goto out_free_id;
2909
2910 if (nvme_init_ns_head(ns, nsid, id, &new))
2911 goto out_free_id;
2912 nvme_setup_streams_ns(ctrl, ns);
2913 nvme_set_disk_name(disk_name, ns, ctrl, &flags);
2914
2915 if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
2916 if (nvme_nvm_register(ns, disk_name, node)) {
2917 dev_warn(ctrl->device, "LightNVM init failure\n");
2918 goto out_unlink_ns;
2919 }
2920 }
2921
2922 disk = alloc_disk_node(0, node);
2923 if (!disk)
2924 goto out_unlink_ns;
2925
2926 disk->fops = &nvme_fops;
2927 disk->private_data = ns;
2928 disk->queue = ns->queue;
2929 disk->flags = flags;
2930 memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
2931 ns->disk = disk;
2932
2933 __nvme_revalidate_disk(disk, id);
2934
2935 mutex_lock(&ctrl->namespaces_mutex);
2936 list_add_tail(&ns->list, &ctrl->namespaces);
2937 mutex_unlock(&ctrl->namespaces_mutex);
2938
2939 nvme_get_ctrl(ctrl);
2940
2941 kfree(id);
2942
2943 device_add_disk(ctrl->device, ns->disk);
2944 if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
2945 &nvme_ns_id_attr_group))
2946 pr_warn("%s: failed to create sysfs group for identification\n",
2947 ns->disk->disk_name);
2948 if (ns->ndev && nvme_nvm_register_sysfs(ns))
2949 pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
2950 ns->disk->disk_name);
2951
2952 if (new)
2953 nvme_mpath_add_disk(ns->head);
2954 return;
2955 out_unlink_ns:
2956 mutex_lock(&ctrl->subsys->lock);
2957 list_del_rcu(&ns->siblings);
2958 mutex_unlock(&ctrl->subsys->lock);
2959 out_free_id:
2960 kfree(id);
2961 out_free_queue:
2962 blk_cleanup_queue(ns->queue);
2963 out_free_ns:
2964 kfree(ns);
2965 }
2966
2967 static void nvme_ns_remove(struct nvme_ns *ns)
2968 {
2969 if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
2970 return;
2971
2972 if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
2973 sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
2974 &nvme_ns_id_attr_group);
2975 if (ns->ndev)
2976 nvme_nvm_unregister_sysfs(ns);
2977 del_gendisk(ns->disk);
2978 blk_cleanup_queue(ns->queue);
2979 if (blk_get_integrity(ns->disk))
2980 blk_integrity_unregister(ns->disk);
2981 }
2982
2983 mutex_lock(&ns->ctrl->subsys->lock);
2984 nvme_mpath_clear_current_path(ns);
2985 list_del_rcu(&ns->siblings);
2986 mutex_unlock(&ns->ctrl->subsys->lock);
2987
2988 mutex_lock(&ns->ctrl->namespaces_mutex);
2989 list_del_init(&ns->list);
2990 mutex_unlock(&ns->ctrl->namespaces_mutex);
2991
2992 synchronize_srcu(&ns->head->srcu);
2993 nvme_mpath_check_last_path(ns);
2994 nvme_put_ns(ns);
2995 }
2996
2997 static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
2998 {
2999 struct nvme_ns *ns;
3000
3001 ns = nvme_find_get_ns(ctrl, nsid);
3002 if (ns) {
3003 if (ns->disk && revalidate_disk(ns->disk))
3004 nvme_ns_remove(ns);
3005 nvme_put_ns(ns);
3006 } else
3007 nvme_alloc_ns(ctrl, nsid);
3008 }
3009
3010 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
3011 unsigned nsid)
3012 {
3013 struct nvme_ns *ns, *next;
3014
3015 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
3016 if (ns->head->ns_id > nsid)
3017 nvme_ns_remove(ns);
3018 }
3019 }
3020
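/*
 * Scan namespaces using the Identify active namespace ID list, 1024
 * NSIDs (one 4k page) at a time.  Every reported NSID is (re)validated,
 * and any NSID we previously knew about that no longer shows up in the
 * list is removed again.
 */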
3021 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
3022 {
3023 struct nvme_ns *ns;
3024 __le32 *ns_list;
3025 unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
3026 int ret = 0;
3027
3028 ns_list = kzalloc(0x1000, GFP_KERNEL);
3029 if (!ns_list)
3030 return -ENOMEM;
3031
3032 for (i = 0; i < num_lists; i++) {
3033 ret = nvme_identify_ns_list(ctrl, prev, ns_list);
3034 if (ret)
3035 goto free;
3036
3037 for (j = 0; j < min(nn, 1024U); j++) {
3038 nsid = le32_to_cpu(ns_list[j]);
3039 if (!nsid)
3040 goto out;
3041
3042 nvme_validate_ns(ctrl, nsid);
3043
3044 while (++prev < nsid) {
3045 ns = nvme_find_get_ns(ctrl, prev);
3046 if (ns) {
3047 nvme_ns_remove(ns);
3048 nvme_put_ns(ns);
3049 }
3050 }
3051 }
3052 nn -= j;
3053 }
3054 out:
3055 nvme_remove_invalid_namespaces(ctrl, prev);
3056 free:
3057 kfree(ns_list);
3058 return ret;
3059 }
3060
3061 static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
3062 {
3063 unsigned i;
3064
3065 for (i = 1; i <= nn; i++)
3066 nvme_validate_ns(ctrl, i);
3067
3068 nvme_remove_invalid_namespaces(ctrl, nn);
3069 }
3070
3071 static void nvme_scan_work(struct work_struct *work)
3072 {
3073 struct nvme_ctrl *ctrl =
3074 container_of(work, struct nvme_ctrl, scan_work);
3075 struct nvme_id_ctrl *id;
3076 unsigned nn;
3077
3078 if (ctrl->state != NVME_CTRL_LIVE)
3079 return;
3080
3081 if (nvme_identify_ctrl(ctrl, &id))
3082 return;
3083
3084 nn = le32_to_cpu(id->nn);
3085 if (ctrl->vs >= NVME_VS(1, 1, 0) &&
3086 !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
3087 if (!nvme_scan_ns_list(ctrl, nn))
3088 goto done;
3089 }
3090 nvme_scan_ns_sequential(ctrl, nn);
3091 done:
3092 mutex_lock(&ctrl->namespaces_mutex);
3093 list_sort(NULL, &ctrl->namespaces, ns_cmp);
3094 mutex_unlock(&ctrl->namespaces_mutex);
3095 kfree(id);
3096 }
3097
3098 void nvme_queue_scan(struct nvme_ctrl *ctrl)
3099 {
3100 /*
3101 * Do not queue new scan work when a controller is reset during
3102 * removal.
3103 */
3104 if (ctrl->state == NVME_CTRL_LIVE)
3105 queue_work(nvme_wq, &ctrl->scan_work);
3106 }
3107 EXPORT_SYMBOL_GPL(nvme_queue_scan);
3108
3109 /*
3110 * This function iterates the namespace list unlocked to allow recovery from
3111 * controller failure. It is up to the caller to ensure the namespace list is
3112 * not modified by scan work while this function is executing.
3113 */
3114 void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
3115 {
3116 struct nvme_ns *ns, *next;
3117
3118 /*
3119 	 * The dead state indicates that the controller was not gracefully
3120 * disconnected. In that case, we won't be able to flush any data while
3121 * removing the namespaces' disks; fail all the queues now to avoid
3122 * potentially having to clean up the failed sync later.
3123 */
3124 if (ctrl->state == NVME_CTRL_DEAD)
3125 nvme_kill_queues(ctrl);
3126
3127 list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
3128 nvme_ns_remove(ns);
3129 }
3130 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
3131
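/*
 * Report a completed Asynchronous Event to userspace by raising a change
 * uevent carrying the raw AEN result, then clear the result so it is
 * only reported once.
 */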
3132 static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
3133 {
3134 char *envp[2] = { NULL, NULL };
3135 u32 aen_result = ctrl->aen_result;
3136
3137 ctrl->aen_result = 0;
3138 if (!aen_result)
3139 return;
3140
3141 envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
3142 if (!envp[0])
3143 return;
3144 kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
3145 kfree(envp[0]);
3146 }
3147
3148 static void nvme_async_event_work(struct work_struct *work)
3149 {
3150 struct nvme_ctrl *ctrl =
3151 container_of(work, struct nvme_ctrl, async_event_work);
3152
3153 nvme_aen_uevent(ctrl);
3154 ctrl->ops->submit_async_event(ctrl);
3155 }
3156
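/* True while the enabled controller reports CSTS.PP (processing paused). */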
3157 static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
3158 {
3159
3160 u32 csts;
3161
3162 if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
3163 return false;
3164
3165 if (csts == ~0)
3166 return false;
3167
3168 return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
3169 }
3170
3171 static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
3172 {
3173 struct nvme_fw_slot_info_log *log;
3174
3175 log = kmalloc(sizeof(*log), GFP_KERNEL);
3176 if (!log)
3177 return;
3178
3179 if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log)))
3180 dev_warn(ctrl->device,
3181 "Get FW SLOT INFO log error\n");
3182 kfree(log);
3183 }
3184
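/*
 * A firmware activation was signalled via AER: quiesce I/O and poll
 * CSTS.PP until the controller resumes processing, bounded by MTFA
 * (reported in 100ms units) or, failing that, the admin timeout.  If the
 * controller never comes back, fall back to a full reset.
 */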
3185 static void nvme_fw_act_work(struct work_struct *work)
3186 {
3187 struct nvme_ctrl *ctrl = container_of(work,
3188 struct nvme_ctrl, fw_act_work);
3189 unsigned long fw_act_timeout;
3190
3191 if (ctrl->mtfa)
3192 fw_act_timeout = jiffies +
3193 msecs_to_jiffies(ctrl->mtfa * 100);
3194 else
3195 fw_act_timeout = jiffies +
3196 msecs_to_jiffies(admin_timeout * 1000);
3197
3198 nvme_stop_queues(ctrl);
3199 while (nvme_ctrl_pp_status(ctrl)) {
3200 if (time_after(jiffies, fw_act_timeout)) {
3201 dev_warn(ctrl->device,
3202 "Fw activation timeout, reset controller\n");
3203 nvme_reset_ctrl(ctrl);
3204 break;
3205 }
3206 msleep(100);
3207 }
3208
3209 if (ctrl->state != NVME_CTRL_LIVE)
3210 return;
3211
3212 nvme_start_queues(ctrl);
3213 /* read FW slot information to clear the AER */
3214 nvme_get_fw_slot_info(ctrl);
3215 }
3216
3217 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
3218 union nvme_result *res)
3219 {
3220 u32 result = le32_to_cpu(res->u32);
3221
3222 if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
3223 return;
3224
3225 switch (result & 0x7) {
3226 case NVME_AER_ERROR:
3227 case NVME_AER_SMART:
3228 case NVME_AER_CSS:
3229 case NVME_AER_VS:
3230 ctrl->aen_result = result;
3231 break;
3232 default:
3233 break;
3234 }
3235
3236 switch (result & 0xff07) {
3237 case NVME_AER_NOTICE_NS_CHANGED:
3238 dev_info(ctrl->device, "rescanning\n");
3239 nvme_queue_scan(ctrl);
3240 break;
3241 case NVME_AER_NOTICE_FW_ACT_STARTING:
3242 queue_work(nvme_wq, &ctrl->fw_act_work);
3243 break;
3244 default:
3245 dev_warn(ctrl->device, "async event result %08x\n", result);
3246 }
3247 queue_work(nvme_wq, &ctrl->async_event_work);
3248 }
3249 EXPORT_SYMBOL_GPL(nvme_complete_async_event);
3250
3251 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
3252 {
3253 nvme_stop_keep_alive(ctrl);
3254 flush_work(&ctrl->async_event_work);
3255 flush_work(&ctrl->scan_work);
3256 cancel_work_sync(&ctrl->fw_act_work);
3257 }
3258 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
3259
3260 void nvme_start_ctrl(struct nvme_ctrl *ctrl)
3261 {
3262 if (ctrl->kato)
3263 nvme_start_keep_alive(ctrl);
3264
3265 if (ctrl->queue_count > 1) {
3266 nvme_queue_scan(ctrl);
3267 queue_work(nvme_wq, &ctrl->async_event_work);
3268 nvme_start_queues(ctrl);
3269 }
3270 }
3271 EXPORT_SYMBOL_GPL(nvme_start_ctrl);
3272
3273 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
3274 {
3275 cdev_device_del(&ctrl->cdev, ctrl->device);
3276 }
3277 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
3278
3279 static void nvme_free_ctrl(struct device *dev)
3280 {
3281 struct nvme_ctrl *ctrl =
3282 container_of(dev, struct nvme_ctrl, ctrl_device);
3283 struct nvme_subsystem *subsys = ctrl->subsys;
3284
3285 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
3286 kfree(ctrl->effects);
3287
3288 if (subsys) {
3289 mutex_lock(&subsys->lock);
3290 list_del(&ctrl->subsys_entry);
3291 mutex_unlock(&subsys->lock);
3292 sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
3293 }
3294
3295 ctrl->ops->free_ctrl(ctrl);
3296
3297 if (subsys)
3298 nvme_put_subsystem(subsys);
3299 }
3300
3301 /*
3302  * Initialize an NVMe controller structure. This needs to be called during
3303  * the earliest initialization so that we have the initialized structure
3304  * around during probing.
3305 */
3306 int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
3307 const struct nvme_ctrl_ops *ops, unsigned long quirks)
3308 {
3309 int ret;
3310
3311 ctrl->state = NVME_CTRL_NEW;
3312 spin_lock_init(&ctrl->lock);
3313 INIT_LIST_HEAD(&ctrl->namespaces);
3314 mutex_init(&ctrl->namespaces_mutex);
3315 ctrl->dev = dev;
3316 ctrl->ops = ops;
3317 ctrl->quirks = quirks;
3318 INIT_WORK(&ctrl->scan_work, nvme_scan_work);
3319 INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
3320 INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
3321 INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
3322
3323 ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
3324 if (ret < 0)
3325 goto out;
3326 ctrl->instance = ret;
3327
3328 device_initialize(&ctrl->ctrl_device);
3329 ctrl->device = &ctrl->ctrl_device;
3330 ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance);
3331 ctrl->device->class = nvme_class;
3332 ctrl->device->parent = ctrl->dev;
3333 ctrl->device->groups = nvme_dev_attr_groups;
3334 ctrl->device->release = nvme_free_ctrl;
3335 dev_set_drvdata(ctrl->device, ctrl);
3336 ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
3337 if (ret)
3338 goto out_release_instance;
3339
3340 cdev_init(&ctrl->cdev, &nvme_dev_fops);
3341 ctrl->cdev.owner = ops->module;
3342 ret = cdev_device_add(&ctrl->cdev, ctrl->device);
3343 if (ret)
3344 goto out_free_name;
3345
3346 /*
3347 * Initialize latency tolerance controls. The sysfs files won't
3348 * be visible to userspace unless the device actually supports APST.
3349 */
3350 ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
3351 dev_pm_qos_update_user_latency_tolerance(ctrl->device,
3352 min(default_ps_max_latency_us, (unsigned long)S32_MAX));
3353
3354 return 0;
3355 out_free_name:
3356 kfree_const(dev->kobj.name);
3357 out_release_instance:
3358 ida_simple_remove(&nvme_instance_ida, ctrl->instance);
3359 out:
3360 return ret;
3361 }
3362 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
3363
3364 /**
3365  * nvme_kill_queues - Ends all namespace queues
3366 * @ctrl: the dead controller that needs to end
3367 *
3368 * Call this function when the driver determines it is unable to get the
3369 * controller in a state capable of servicing IO.
3370 */
3371 void nvme_kill_queues(struct nvme_ctrl *ctrl)
3372 {
3373 struct nvme_ns *ns;
3374
3375 mutex_lock(&ctrl->namespaces_mutex);
3376
3377 /* Forcibly unquiesce queues to avoid blocking dispatch */
3378 if (ctrl->admin_q)
3379 blk_mq_unquiesce_queue(ctrl->admin_q);
3380
3381 list_for_each_entry(ns, &ctrl->namespaces, list) {
3382 /*
3383 * Revalidating a dead namespace sets capacity to 0. This will
3384 * end buffered writers dirtying pages that can't be synced.
3385 */
3386 if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
3387 continue;
3388 revalidate_disk(ns->disk);
3389 blk_set_queue_dying(ns->queue);
3390
3391 /* Forcibly unquiesce queues to avoid blocking dispatch */
3392 blk_mq_unquiesce_queue(ns->queue);
3393 }
3394 mutex_unlock(&ctrl->namespaces_mutex);
3395 }
3396 EXPORT_SYMBOL_GPL(nvme_kill_queues);
3397
3398 void nvme_unfreeze(struct nvme_ctrl *ctrl)
3399 {
3400 struct nvme_ns *ns;
3401
3402 mutex_lock(&ctrl->namespaces_mutex);
3403 list_for_each_entry(ns, &ctrl->namespaces, list)
3404 blk_mq_unfreeze_queue(ns->queue);
3405 mutex_unlock(&ctrl->namespaces_mutex);
3406 }
3407 EXPORT_SYMBOL_GPL(nvme_unfreeze);
3408
3409 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
3410 {
3411 struct nvme_ns *ns;
3412
3413 mutex_lock(&ctrl->namespaces_mutex);
3414 list_for_each_entry(ns, &ctrl->namespaces, list) {
3415 timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
3416 if (timeout <= 0)
3417 break;
3418 }
3419 mutex_unlock(&ctrl->namespaces_mutex);
3420 }
3421 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
3422
3423 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
3424 {
3425 struct nvme_ns *ns;
3426
3427 mutex_lock(&ctrl->namespaces_mutex);
3428 list_for_each_entry(ns, &ctrl->namespaces, list)
3429 blk_mq_freeze_queue_wait(ns->queue);
3430 mutex_unlock(&ctrl->namespaces_mutex);
3431 }
3432 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
3433
3434 void nvme_start_freeze(struct nvme_ctrl *ctrl)
3435 {
3436 struct nvme_ns *ns;
3437
3438 mutex_lock(&ctrl->namespaces_mutex);
3439 list_for_each_entry(ns, &ctrl->namespaces, list)
3440 blk_freeze_queue_start(ns->queue);
3441 mutex_unlock(&ctrl->namespaces_mutex);
3442 }
3443 EXPORT_SYMBOL_GPL(nvme_start_freeze);
3444
3445 void nvme_stop_queues(struct nvme_ctrl *ctrl)
3446 {
3447 struct nvme_ns *ns;
3448
3449 mutex_lock(&ctrl->namespaces_mutex);
3450 list_for_each_entry(ns, &ctrl->namespaces, list)
3451 blk_mq_quiesce_queue(ns->queue);
3452 mutex_unlock(&ctrl->namespaces_mutex);
3453 }
3454 EXPORT_SYMBOL_GPL(nvme_stop_queues);
3455
3456 void nvme_start_queues(struct nvme_ctrl *ctrl)
3457 {
3458 struct nvme_ns *ns;
3459
3460 mutex_lock(&ctrl->namespaces_mutex);
3461 list_for_each_entry(ns, &ctrl->namespaces, list)
3462 blk_mq_unquiesce_queue(ns->queue);
3463 mutex_unlock(&ctrl->namespaces_mutex);
3464 }
3465 EXPORT_SYMBOL_GPL(nvme_start_queues);
3466
3467 int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set)
3468 {
3469 if (!ctrl->ops->reinit_request)
3470 return 0;
3471
3472 return blk_mq_tagset_iter(set, set->driver_data,
3473 ctrl->ops->reinit_request);
3474 }
3475 EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
3476
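/*
 * Flush any pending block layer timeout work on every namespace queue so
 * that a controller reset does not race with timeout handling.
 */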
3477 void nvme_sync_queues(struct nvme_ctrl *ctrl)
3478 {
3479 struct nvme_ns *ns;
3480
3481 mutex_lock(&ctrl->namespaces_mutex);
3482 list_for_each_entry(ns, &ctrl->namespaces, list)
3483 blk_sync_queue(ns->queue);
3484 mutex_unlock(&ctrl->namespaces_mutex);
3485 }
3486 EXPORT_SYMBOL_GPL(nvme_sync_queues);
3487
3488 int __init nvme_core_init(void)
3489 {
3490 int result;
3491
3492 nvme_wq = alloc_workqueue("nvme-wq",
3493 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
3494 if (!nvme_wq)
3495 return -ENOMEM;
3496
3497 result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
3498 if (result < 0)
3499 goto destroy_wq;
3500
3501 nvme_class = class_create(THIS_MODULE, "nvme");
3502 if (IS_ERR(nvme_class)) {
3503 result = PTR_ERR(nvme_class);
3504 goto unregister_chrdev;
3505 }
3506
3507 nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
3508 if (IS_ERR(nvme_subsys_class)) {
3509 result = PTR_ERR(nvme_subsys_class);
3510 goto destroy_class;
3511 }
3512 return 0;
3513
3514 destroy_class:
3515 class_destroy(nvme_class);
3516 unregister_chrdev:
3517 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
3518 destroy_wq:
3519 destroy_workqueue(nvme_wq);
3520 return result;
3521 }
3522
3523 void nvme_core_exit(void)
3524 {
3525 ida_destroy(&nvme_subsystems_ida);
3526 class_destroy(nvme_subsys_class);
3527 class_destroy(nvme_class);
3528 unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
3529 destroy_workqueue(nvme_wq);
3530 }
3531
3532 MODULE_LICENSE("GPL");
3533 MODULE_VERSION("1.0");
3534 module_init(nvme_core_init);
3535 module_exit(nvme_core_exit);