/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list_sort.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <linux/ptrace.h>
#include <linux/nvme_ioctl.h>
#include <linux/t10-pi.h>
#include <scsi/sg.h>
#include <asm/unaligned.h>

#include "nvme.h"

static int nvme_major;
module_param(nvme_major, int, 0);

DEFINE_SPINLOCK(dev_list_lock);

static void nvme_free_ns(struct kref *kref)
{
	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);

	if (ns->type == NVME_NS_LIGHTNVM)
		nvme_nvm_unregister(ns->queue, ns->disk->disk_name);

	spin_lock(&dev_list_lock);
	ns->disk->private_data = NULL;
	spin_unlock(&dev_list_lock);

	nvme_put_ctrl(ns->ctrl);
	put_disk(ns->disk);
	kfree(ns);
}

static void nvme_put_ns(struct nvme_ns *ns)
{
	kref_put(&ns->kref, nvme_free_ns);
}

static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
{
	struct nvme_ns *ns;

	spin_lock(&dev_list_lock);
	ns = disk->private_data;
	if (ns && !kref_get_unless_zero(&ns->kref))
		ns = NULL;
	spin_unlock(&dev_list_lock);

	return ns;
}

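/*
 * Allocate a block layer request for an NVMe passthrough command.  The
 * caller's nvme_command is referenced directly (not copied), so it must
 * remain valid until the request completes.
 */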
struct request *nvme_alloc_request(struct request_queue *q,
		struct nvme_command *cmd, unsigned int flags)
{
	bool write = cmd->common.opcode & 1;
	struct request *req;

	req = blk_mq_alloc_request(q, write, flags);
	if (IS_ERR(req))
		return req;

	req->cmd_type = REQ_TYPE_DRV_PRIV;
	req->cmd_flags |= REQ_FAILFAST_DRIVER;
	req->__data_len = 0;
	req->__sector = (sector_t) -1;
	req->bio = req->biotail = NULL;

	req->cmd = (unsigned char *)cmd;
	req->cmd_len = sizeof(struct nvme_command);
	req->special = (void *)0;

	return req;
}

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code.
 */
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		void *buffer, unsigned bufflen, u32 *result, unsigned timeout)
{
	struct request *req;
	int ret;

	req = nvme_alloc_request(q, cmd, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	if (buffer && bufflen) {
		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
		if (ret)
			goto out;
	}

	blk_execute_rq(req->q, NULL, req, 0);
	if (result)
		*result = (u32)(uintptr_t)req->special;
	ret = req->errors;
 out:
	blk_mq_free_request(req);
	return ret;
}

int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
		void *buffer, unsigned bufflen)
{
	return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
}

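/*
 * Map a user space data buffer (and optional metadata buffer) into a
 * request, execute it synchronously, and copy the metadata back to user
 * space on a successful read.
 */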
int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen,
		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
		u32 *result, unsigned timeout)
{
	bool write = cmd->common.opcode & 1;
	struct nvme_ns *ns = q->queuedata;
	struct gendisk *disk = ns ? ns->disk : NULL;
	struct request *req;
	struct bio *bio = NULL;
	void *meta = NULL;
	int ret;

	req = nvme_alloc_request(q, cmd, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	if (ubuffer && bufflen) {
		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
				GFP_KERNEL);
		if (ret)
			goto out;
		bio = req->bio;

		if (!disk)
			goto submit;
		bio->bi_bdev = bdget_disk(disk, 0);
		if (!bio->bi_bdev) {
			ret = -ENODEV;
			goto out_unmap;
		}

		if (meta_buffer) {
			struct bio_integrity_payload *bip;

			meta = kmalloc(meta_len, GFP_KERNEL);
			if (!meta) {
				ret = -ENOMEM;
				goto out_unmap;
			}

			if (write) {
				if (copy_from_user(meta, meta_buffer,
						meta_len)) {
					ret = -EFAULT;
					goto out_free_meta;
				}
			}

			bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
			if (!bip) {
				ret = -ENOMEM;
				goto out_free_meta;
			}

			bip->bip_iter.bi_size = meta_len;
			bip->bip_iter.bi_sector = meta_seed;

			ret = bio_integrity_add_page(bio, virt_to_page(meta),
					meta_len, offset_in_page(meta));
			if (ret != meta_len) {
				ret = -ENOMEM;
				goto out_free_meta;
			}
		}
	}
 submit:
	blk_execute_rq(req->q, disk, req, 0);
	ret = req->errors;
	if (result)
		*result = (u32)(uintptr_t)req->special;
	if (meta && !ret && !write) {
		if (copy_to_user(meta_buffer, meta, meta_len))
			ret = -EFAULT;
	}
 out_free_meta:
	kfree(meta);
 out_unmap:
	if (bio) {
		if (disk && bio->bi_bdev)
			bdput(bio->bi_bdev);
		blk_rq_unmap_user(bio);
	}
 out:
	blk_mq_free_request(req);
	return ret;
}

int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen, u32 *result,
		unsigned timeout)
{
	return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
			result, timeout);
}

int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.cns = cpu_to_le32(1);

	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ctrl));
	if (error)
		kfree(*id);
	return error;
}

int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
		struct nvme_id_ns **id)
{
	struct nvme_command c = { };
	int error;

	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);

	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ns));
	if (error)
		kfree(*id);
	return error;
}

int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
		dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(nsid);
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);

	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
}

int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
		dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
}

int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
{
	struct nvme_command c = { };
	int error;

	c.common.opcode = nvme_admin_get_log_page;
	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
	c.common.cdw10[0] = cpu_to_le32(
			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
			NVME_LOG_SMART);

	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
	if (!*log)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
			sizeof(struct nvme_smart_log));
	if (error)
		kfree(*log);
	return error;
}

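/*
 * Handle the NVME_IOCTL_SUBMIT_IO ioctl: validate the user supplied
 * nvme_user_io, build the corresponding read/write/compare command and
 * submit it through the user command path.
 */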
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	void __user *metadata;

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		break;
	default:
		return -EINVAL;
	}

	length = (io.nblocks + 1) << ns->lba_shift;
	meta_len = (io.nblocks + 1) * ns->ms;
	metadata = (void __user *)(uintptr_t)io.metadata;

	if (ns->ext) {
		length += meta_len;
		meta_len = 0;
	} else if (meta_len) {
		if ((io.metadata & 3) || !io.metadata)
			return -EINVAL;
	}

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);

	return __nvme_submit_user_cmd(ns->queue, &c,
			(void __user *)(uintptr_t)io.addr, length,
			metadata, meta_len, io.slba, NULL, 0);
}

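/*
 * Generic command passthrough for NVME_IOCTL_ADMIN_CMD and
 * NVME_IOCTL_IO_CMD.  Requires CAP_SYS_ADMIN since arbitrary commands can
 * be sent to the device.
 */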
int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
			struct nvme_passthru_cmd __user *ucmd)
{
	struct nvme_passthru_cmd cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	int status;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
			(void __user *)cmd.addr, cmd.data_len,
			&cmd.result, timeout);
	if (status >= 0) {
		if (put_user(cmd.result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
		unsigned int cmd, unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		force_successful_syscall_return();
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
	case NVME_IOCTL_IO_CMD:
		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
	case SG_GET_VERSION_NUM:
		return nvme_sg_get_version_num((void __user *)arg);
	case SG_IO:
		return nvme_sg_io(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

#ifdef CONFIG_COMPAT
static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case SG_IO:
		return -ENOIOCTLCMD;
	}
	return nvme_ioctl(bdev, mode, cmd, arg);
}
#else
#define nvme_compat_ioctl	NULL
#endif

static int nvme_open(struct block_device *bdev, fmode_t mode)
{
	return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
}

static void nvme_release(struct gendisk *disk, fmode_t mode)
{
	nvme_put_ns(disk->private_data);
}

static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	/* some standard values */
	geo->heads = 1 << 6;
	geo->sectors = 1 << 5;
	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
	return 0;
}

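/*
 * Register the namespace's T10 protection information profile with the
 * block layer so that end-to-end data protection can be used.
 */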
#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_init_integrity(struct nvme_ns *ns)
{
	struct blk_integrity integrity;

	memset(&integrity, 0, sizeof(integrity));
	switch (ns->pi_type) {
	case NVME_NS_DPS_PI_TYPE3:
		integrity.profile = &t10_pi_type3_crc;
		break;
	case NVME_NS_DPS_PI_TYPE1:
	case NVME_NS_DPS_PI_TYPE2:
		integrity.profile = &t10_pi_type1_crc;
		break;
	default:
		integrity.profile = NULL;
		break;
	}
	integrity.tuple_size = ns->ms;
	blk_integrity_register(ns->disk, &integrity);
	blk_queue_max_integrity_segments(ns->queue, 1);
}
#else
static void nvme_init_integrity(struct nvme_ns *ns)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */

static void nvme_config_discard(struct nvme_ns *ns)
{
	u32 logical_block_size = queue_logical_block_size(ns->queue);
	ns->queue->limits.discard_zeroes_data = 0;
	ns->queue->limits.discard_alignment = logical_block_size;
	ns->queue->limits.discard_granularity = logical_block_size;
	blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}

static int nvme_revalidate_disk(struct gendisk *disk)
{
	struct nvme_ns *ns = disk->private_data;
	struct nvme_id_ns *id;
	u8 lbaf, pi_type;
	u16 old_ms;
	unsigned short bs;

	if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
		dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
				__func__, ns->ctrl->instance, ns->ns_id);
		return -ENODEV;
	}
	if (id->ncap == 0) {
		kfree(id);
		return -ENODEV;
	}

	if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
		if (nvme_nvm_register(ns->queue, disk->disk_name)) {
			dev_warn(ns->ctrl->dev,
				"%s: LightNVM init failure\n", __func__);
			kfree(id);
			return -ENODEV;
		}
		ns->type = NVME_NS_LIGHTNVM;
	}

	old_ms = ns->ms;
	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
	ns->lba_shift = id->lbaf[lbaf].ds;
	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);

	/*
	 * If identify namespace failed, use default 512 byte block size so
	 * block layer can use before failing read/write for 0 capacity.
	 */
	if (ns->lba_shift == 0)
		ns->lba_shift = 9;
	bs = 1 << ns->lba_shift;

	/* XXX: PI implementation requires metadata equal t10 pi tuple size */
	pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
					id->dps & NVME_NS_DPS_PI_MASK : 0;

	blk_mq_freeze_queue(disk->queue);
	if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
				ns->ms != old_ms ||
				bs != queue_logical_block_size(disk->queue) ||
				(ns->ms && ns->ext)))
		blk_integrity_unregister(disk);

	ns->pi_type = pi_type;
	blk_queue_logical_block_size(ns->queue, bs);

	if (ns->ms && !ns->ext)
		nvme_init_integrity(ns);

	if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
		set_capacity(disk, 0);
	else
		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
		nvme_config_discard(ns);
	blk_mq_unfreeze_queue(disk->queue);

	kfree(id);
	return 0;
}

static char nvme_pr_type(enum pr_type type)
{
	switch (type) {
	case PR_WRITE_EXCLUSIVE:
		return 1;
	case PR_EXCLUSIVE_ACCESS:
		return 2;
	case PR_WRITE_EXCLUSIVE_REG_ONLY:
		return 3;
	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
		return 4;
	case PR_WRITE_EXCLUSIVE_ALL_REGS:
		return 5;
	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
		return 6;
	default:
		return 0;
	}
}

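/*
 * Translate a block layer persistent reservation operation into the
 * corresponding NVMe reservation command.  The current and new keys are
 * passed in a 16 byte data buffer.
 */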
static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
				u64 key, u64 sa_key, u8 op)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;
	struct nvme_command c;
	u8 data[16] = { 0, };

	put_unaligned_le64(key, &data[0]);
	put_unaligned_le64(sa_key, &data[8]);

	memset(&c, 0, sizeof(c));
	c.common.opcode = op;
	c.common.nsid = cpu_to_le32(ns->ns_id);
	c.common.cdw10[0] = cpu_to_le32(cdw10);

	return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
}

static int nvme_pr_register(struct block_device *bdev, u64 old,
		u64 new, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	cdw10 = old ? 2 : 0;
	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
}

static int nvme_pr_reserve(struct block_device *bdev, u64 key,
		enum pr_type type, unsigned flags)
{
	u32 cdw10;

	if (flags & ~PR_FL_IGNORE_KEY)
		return -EOPNOTSUPP;

	cdw10 = nvme_pr_type(type) << 8;
	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
}

static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
		enum pr_type type, bool abort)
{
	/* parenthesize the conditional: '|' binds tighter than '?:' */
	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);

	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
}

static int nvme_pr_clear(struct block_device *bdev, u64 key)
{
	u32 cdw10 = 1 | (key ? 1 << 3 : 0);

	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
}

static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);

	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
}

static const struct pr_ops nvme_pr_ops = {
	.pr_register	= nvme_pr_register,
	.pr_reserve	= nvme_pr_reserve,
	.pr_release	= nvme_pr_release,
	.pr_preempt	= nvme_pr_preempt,
	.pr_clear	= nvme_pr_clear,
};

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_compat_ioctl,
	.open		= nvme_open,
	.release	= nvme_release,
	.getgeo		= nvme_getgeo,
	.revalidate_disk= nvme_revalidate_disk,
	.pr_ops		= &nvme_pr_ops,
};

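/*
 * Poll CSTS.RDY until it matches the expected state.  The timeout is
 * derived from CAP.TO, which is specified in 500ms units.
 */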
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
{
	unsigned long timeout =
		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
	int ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if ((csts & NVME_CSTS_RDY) == bit)
			break;

		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(ctrl->dev,
				"Device not ready; aborting %s\n", enabled ?
						"initialisation" : "reset");
			return -ENODEV;
		}
	}

	return ret;
}

/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit.  The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	ctrl->ctrl_config &= ~NVME_CC_ENABLE;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;
	return nvme_wait_ready(ctrl, cap, false);
}

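/*
 * Program the controller configuration register: NVM command set, memory
 * page size, arbitration, queue entry sizes and finally the enable bit,
 * then wait for CSTS.RDY to be set.
 */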
int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
	/*
	 * Default to a 4K page size, with the intention to update this
	 * path in the future to accommodate architectures with differing
	 * kernel and IO page sizes.
	 */
	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
	int ret;

	if (page_shift < dev_page_min) {
		dev_err(ctrl->dev,
			"Minimum device page size %u too large for host (%u)\n",
			1 << dev_page_min, 1 << page_shift);
		return -ENODEV;
	}

	ctrl->page_size = 1 << page_shift;

	ctrl->ctrl_config = NVME_CC_CSS_NVM;
	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
	ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
	ctrl->ctrl_config |= NVME_CC_ENABLE;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;
	return nvme_wait_ready(ctrl, cap, true);
}

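/*
 * Request a normal controller shutdown and wait until the controller
 * reports that shutdown processing is complete (CSTS.SHST).
 */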
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
{
	unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies;
	u32 csts;
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
			break;

		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(ctrl->dev,
				"Device shutdown incomplete; abort shutdown\n");
			return -ENODEV;
		}
	}

	return ret;
}

/*
 * Initialize the cached copies of the Identify data and various controller
 * registers in our nvme_ctrl structure.  This should be called as soon as
 * the admin queue is fully up and running.
 */
int nvme_init_identify(struct nvme_ctrl *ctrl)
{
	struct nvme_id_ctrl *id;
	u64 cap;
	int ret, page_shift;

	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
	if (ret) {
		dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret);
		return ret;
	}
	page_shift = NVME_CAP_MPSMIN(cap) + 12;

	ret = nvme_identify_ctrl(ctrl, &id);
	if (ret) {
		dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret);
		return -EIO;
	}

	ctrl->oncs = le16_to_cpup(&id->oncs);
	ctrl->abort_limit = id->acl + 1;
	ctrl->vwc = id->vwc;
	memcpy(ctrl->serial, id->sn, sizeof(id->sn));
	memcpy(ctrl->model, id->mn, sizeof(id->mn));
	memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
	if (id->mdts)
		ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9);
	else
		ctrl->max_hw_sectors = UINT_MAX;

	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
		unsigned int max_hw_sectors;

		ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
		max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
		if (ctrl->max_hw_sectors) {
			ctrl->max_hw_sectors = min(max_hw_sectors,
							ctrl->max_hw_sectors);
		} else {
			ctrl->max_hw_sectors = max_hw_sectors;
		}
	}

	kfree(id);
	return 0;
}

static void nvme_free_ctrl(struct kref *kref)
{
	struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);

	ctrl->ops->free_ctrl(ctrl);
}

void nvme_put_ctrl(struct nvme_ctrl *ctrl)
{
	kref_put(&ctrl->kref, nvme_free_ctrl);
}

static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);

	return nsa->ns_id - nsb->ns_id;
}

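/*
 * The namespace list is kept sorted by namespace id (see ns_cmp), so the
 * search can stop as soon as an entry with a larger id is encountered.
 */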
static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns *ns;

	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->ns_id == nsid)
			return ns;
		if (ns->ns_id > nsid)
			break;
	}
	return NULL;
}

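/*
 * Allocate a new namespace: set up its request queue and gendisk, add it
 * to the controller's namespace list, and register the disk once the
 * namespace format has been validated.  LightNVM namespaces are not
 * exposed as block devices here.
 */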
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int node = dev_to_node(ctrl->dev);

	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
	if (!ns)
		return;

	ns->queue = blk_mq_init_queue(ctrl->tagset);
	if (IS_ERR(ns->queue))
		goto out_free_ns;
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
	ns->queue->queuedata = ns;
	ns->ctrl = ctrl;

	disk = alloc_disk_node(0, node);
	if (!disk)
		goto out_free_queue;

	kref_init(&ns->kref);
	ns->ns_id = nsid;
	ns->disk = disk;
	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
	list_add_tail(&ns->list, &ctrl->namespaces);

	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	if (ctrl->max_hw_sectors) {
		blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
		blk_queue_max_segments(ns->queue,
			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
	}
	if (ctrl->stripe_size)
		blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
	blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);

	disk->major = nvme_major;
	disk->first_minor = 0;
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = ctrl->device;
	disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);

	/*
	 * Initialize capacity to 0 until we establish the namespace format and
	 * setup integrity extensions if necessary.  The revalidate_disk after
	 * add_disk allows the driver to register with integrity if the format
	 * requires it.
	 */
	set_capacity(disk, 0);
	if (nvme_revalidate_disk(ns->disk))
		goto out_free_disk;

	kref_get(&ctrl->kref);
	if (ns->type != NVME_NS_LIGHTNVM) {
		add_disk(ns->disk);
		if (ns->ms) {
			struct block_device *bd = bdget_disk(ns->disk, 0);
			if (!bd)
				return;
			if (blkdev_get(bd, FMODE_READ, NULL)) {
				bdput(bd);
				return;
			}
			blkdev_reread_part(bd);
			blkdev_put(bd, FMODE_READ);
		}
	}

	return;
 out_free_disk:
	kfree(disk);
	list_del(&ns->list);
 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
}

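/*
 * Tear down a namespace: mark the queue dying if the controller can no
 * longer service I/O, unregister the gendisk, clean up the request queue,
 * and drop the namespace reference taken at allocation time.
 */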
static void nvme_ns_remove(struct nvme_ns *ns)
{
	bool kill = nvme_io_incapable(ns->ctrl) &&
			!blk_queue_dying(ns->queue);

	if (kill)
		blk_set_queue_dying(ns->queue);
	if (ns->disk->flags & GENHD_FL_UP) {
		if (blk_get_integrity(ns->disk))
			blk_integrity_unregister(ns->disk);
		del_gendisk(ns->disk);
	}
	if (kill || !blk_queue_dying(ns->queue)) {
		blk_mq_abort_requeue_list(ns->queue);
		blk_cleanup_queue(ns->queue);
	}
	list_del_init(&ns->list);
	nvme_put_ns(ns);
}

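/*
 * Walk namespace ids 1..nn: revalidate namespaces we already know about
 * (removing any that fail), allocate newly reported ones, drop namespaces
 * beyond the reported count, and keep the list sorted by namespace id.
 */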
static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn)
{
	struct nvme_ns *ns, *next;
	unsigned i;

	for (i = 1; i <= nn; i++) {
		ns = nvme_find_ns(ctrl, i);
		if (ns) {
			if (revalidate_disk(ns->disk))
				nvme_ns_remove(ns);
		} else
			nvme_alloc_ns(ctrl, i);
	}
	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
		if (ns->ns_id > nn)
			nvme_ns_remove(ns);
	}
	list_sort(NULL, &ctrl->namespaces, ns_cmp);
}

void nvme_scan_namespaces(struct nvme_ctrl *ctrl)
{
	struct nvme_id_ctrl *id;

	if (nvme_identify_ctrl(ctrl, &id))
		return;
	__nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn));
	kfree(id);
}

void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns, *next;

	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
		nvme_ns_remove(ns);
}

int __init nvme_core_init(void)
{
	int result;

	result = register_blkdev(nvme_major, "nvme");
	if (result < 0)
		return result;
	else if (result > 0)
		nvme_major = result;

	return 0;
}

void nvme_core_exit(void)
{
	unregister_blkdev(nvme_major, "nvme");
}