/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */
#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/version.h>
#define NVME_Q_DEPTH		1024
#define SQ_SIZE(depth)		((depth) * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		((depth) * sizeof(struct nvme_completion))
#define NVME_MINORS		64
static int nvme_major;
module_param(nvme_major, int, 0);
/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct list_head node;
	struct nvme_queue **queues;
	u32 __iomem *dbs;
	struct pci_dev *pci_dev;
	int instance;
	int queue_count;
	u32 ctrl_config;
	struct msix_entry *entry;
	struct nvme_bar __iomem *bar;
	struct list_head namespaces;
};
/*
 * An NVM Express namespace is equivalent to a SCSI LUN
 */
struct nvme_ns {
	struct list_head list;

	struct nvme_dev *dev;
	struct request_queue *queue;
	struct gendisk *disk;

	int ns_id;
	int lba_shift;
};
/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 cq_cycle;
	unsigned long cmdid_data[];
};
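/*
 * Layout of cmdid_data (inferred from alloc_cmdid and nvme_alloc_queue
 * below): the flexible array starts with a command-ID allocation bitmap
 * of BITS_TO_LONGS(q_depth) longs, followed by q_depth longs of
 * per-command context, so the context word for command N lives at
 * cmdid_data[N + BITS_TO_LONGS(q_depth)].
 */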
/* Check we didn't inadvertently grow the command struct */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
}
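/*
 * These sizes track the NVMe 1.0 on-the-wire formats: every submission
 * queue entry is 64 bytes, and the Identify Controller / Identify
 * Namespace structures each fill exactly one 4KB page.
 */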
/**
 * alloc_cmdid - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The ID of the handler to call
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, int handler)
{
	int depth = nvmeq->q_depth;
	unsigned long data = (unsigned long)ctx | handler;
	int cmdid;

	BUG_ON((unsigned long)ctx & 3);

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(depth)] = data;
	return cmdid;
}
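/*
 * Worked example of the packing described above, using a hypothetical
 * 4-byte aligned ctx pointer such as 0x...5680: its two low bits are
 * clear, so ORing in bio_completion_id (1) yields 0x...5681.
 * free_cmdid() hands the word back, and nvme_process_cq() recovers the
 * handler with (data & 3) and the pointer with (data & ~3UL).
 */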
static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
								int handler)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}
/* If you need more than four handlers, you'll need to change how
 * alloc_cmdid and nvme_process_cq work
 */
enum {
	sync_completion_id = 0,
	bio_completion_id,
};
static unsigned long free_cmdid(struct nvme_queue *nvmeq, int cmdid)
{
	unsigned long data;

	data = nvmeq->cmdid_data[cmdid + BITS_TO_LONGS(nvmeq->q_depth)];
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return data;
}
static struct nvme_queue *get_nvmeq(struct nvme_ns *ns)
{
	return ns->dev->queues[1];
}
static void put_nvmeq(struct nvme_queue *nvmeq)
{
}
/**
 * nvme_submit_cmd: Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;

	/* XXX: Need to check tail isn't going to overrun head */
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	writel(tail, nvmeq->q_db);
	if (++tail == nvmeq->q_depth)
		tail = 0;
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;
}
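/*
 * The submission queue is circular: with the default NVME_Q_DEPTH of
 * 1024, the entry after slot 1023 is slot 0 again, which is why the
 * tail index wraps back to zero rather than running past q_depth.
 */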
struct nvme_req_info {
	struct bio *bio;
	int nents;
	struct scatterlist sg[0];
};
/* XXX: use a mempool */
static struct nvme_req_info *alloc_info(unsigned nseg, gfp_t gfp)
{
	return kmalloc(sizeof(struct nvme_req_info) +
				sizeof(struct scatterlist) * nseg, gfp);
}
static void free_info(struct nvme_req_info *info)
{
	kfree(info);
}
static void bio_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_req_info *info = ctx;
	struct bio *bio = info->bio;
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	dma_unmap_sg(nvmeq->q_dmadev, info->sg, info->nents,
			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	free_info(info);
	bio_endio(bio, status ? -EIO : 0);
}
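/*
 * The ">> 1" above strips the phase tag carried in bit 0 of the
 * completion status field, leaving only the NVMe status code, so a
 * non-zero value really does mean the command failed.
 */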
static int nvme_map_bio(struct device *dev, struct nvme_req_info *info,
		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
{
	struct bio_vec *bvec;
	struct scatterlist *sg = info->sg;
	int i, nsegs = 0;

	sg_init_table(sg, psegs);
	bio_for_each_segment(bvec, bio, i) {
		sg_set_page(sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
		sg++;
		/* XXX: handle non-mergable here */
		nsegs++;
	}
	info->nents = nsegs;

	return dma_map_sg(dev, info->sg, info->nents, dma_dir);
}
static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								struct bio *bio)
{
	struct nvme_rw_command *cmnd;
	struct nvme_req_info *info;
	enum dma_data_direction dma_dir;
	int cmdid;
	u16 control;
	u32 dsmgmt;
	unsigned long flags;
	int psegs = bio_phys_segments(ns->queue, bio);

	info = alloc_info(psegs, GFP_NOIO);
	if (!info)
		goto congestion;
	info->bio = bio;

	cmdid = alloc_cmdid(nvmeq, info, bio_completion_id);
	if (unlikely(cmdid < 0))
		goto free_info;

	control = 0;
	if (bio->bi_rw & REQ_FUA)
		control |= NVME_RW_FUA;
	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	dsmgmt = 0;
	if (bio->bi_rw & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	spin_lock_irqsave(&nvmeq->q_lock, flags);
	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail].rw;

	if (bio_data_dir(bio)) {
		cmnd->opcode = nvme_cmd_write;
		dma_dir = DMA_TO_DEVICE;
	} else {
		cmnd->opcode = nvme_cmd_read;
		dma_dir = DMA_FROM_DEVICE;
	}

	nvme_map_bio(nvmeq->q_dmadev, info, bio, dma_dir, psegs);

	cmnd->command_id = cmdid;
	cmnd->nsid = cpu_to_le32(ns->ns_id);
	cmnd->prp1 = cpu_to_le64(sg_phys(info->sg));
	/* XXX: Support more than one PRP */
	cmnd->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
	cmnd->length = cpu_to_le16((bio->bi_size >> ns->lba_shift) - 1);
	cmnd->control = cpu_to_le16(control);
	cmnd->dsmgmt = cpu_to_le32(dsmgmt);

	writel(nvmeq->sq_tail, nvmeq->q_db);
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;

	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;

 free_info:
	free_info(info);
 congestion:
	return -EBUSY;
}
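/*
 * Worked example of the LBA arithmetic above, assuming a namespace
 * formatted with 4KB blocks (lba_shift == 12): a bio starting at
 * 512-byte sector 2048 with bi_size of 16384 yields
 * slba = 2048 >> (12 - 9) = 256 and length = (16384 >> 12) - 1 = 3,
 * i.e. four logical blocks, since the length field is zero-based.
 */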
/*
 * NB: return value of non-zero would mean that we were a stacking driver.
 * make_request must always succeed.
 */
static int nvme_make_request(struct request_queue *q, struct bio *bio)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_queue *nvmeq = get_nvmeq(ns);

	if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
		blk_set_queue_congested(q, rw_is_sync(bio->bi_rw));
		bio_list_add(&nvmeq->sq_cong, bio);
	}
	put_nvmeq(nvmeq);

	return 0;
}
struct sync_cmd_info {
	struct task_struct *task;
	u32 result;
	int status;
};
static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct sync_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	wake_up_process(cmdinfo->task);
}
typedef void (*completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);
static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, cycle;

	static const completion_fn completions[4] = {
		[sync_completion_id] = sync_completion,
		[bio_completion_id]  = bio_completion,
	};

	head = nvmeq->cq_head;
	cycle = nvmeq->cq_cycle;

	for (;;) {
		unsigned long data;
		void *ptr;
		unsigned char handler;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != cycle)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			cycle = !cycle;
		}

		data = free_cmdid(nvmeq, cqe.command_id);
		handler = data & 3;
		ptr = (void *)(data & ~3UL);
		completions[handler](nvmeq, ptr, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && cycle == nvmeq->cq_cycle)
		return IRQ_NONE;

	writel(head, nvmeq->q_db + 1);
	nvmeq->cq_head = head;
	nvmeq->cq_cycle = cycle;

	return IRQ_HANDLED;
}
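/*
 * Phase-tag recap: the controller sets bit 0 of each completion's
 * status field to the phase value of the current pass through the
 * queue, starting at 1 on a freshly created queue.  An entry is new
 * only while that bit matches cq_cycle; every time head wraps past
 * q_depth the expected value is inverted, so stale entries from the
 * previous pass are ignored without ever zeroing the queue memory.
 */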
static irqreturn_t nvme_irq(int irq, void *data)
{
	return nvme_process_cq(data);
}
/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
static int nvme_submit_sync_cmd(struct nvme_queue *q, struct nvme_command *cmd,
								u32 *result)
{
	int cmdid;
	struct sync_cmd_info cmdinfo;

	cmdinfo.task = current;
	cmdinfo.status = -EINTR;

	cmdid = alloc_cmdid_killable(q, &cmdinfo, sync_completion_id);
	if (cmdid < 0)
		return cmdid;
	cmd->common.command_id = cmdid;

	set_current_state(TASK_UNINTERRUPTIBLE);
	nvme_submit_cmd(q, cmd);
	schedule();

	if (result)
		*result = cmdinfo.result;

	return cmdinfo.status;
}
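/*
 * Ordering note: the task state is flipped to TASK_UNINTERRUPTIBLE
 * *before* the command is submitted, so if the interrupt fires and
 * sync_completion() calls wake_up_process() before schedule() runs,
 * the wakeup just makes the task runnable again and schedule()
 * returns immediately instead of sleeping forever.
 */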
static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(dev->queues[0], cmd, result);
}
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	int status;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}
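/*
 * Two conventions worth noting in the create commands above: queue
 * sizes are zero-based in NVMe, so a q_depth of 1024 is encoded as a
 * qsize of 1023, and each submission queue names the completion queue
 * it posts to; using the same id for sqid and cqid pairs the two
 * queues one-to-one.
 */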
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}
static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}
static void nvme_free_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];

	free_irq(dev->entry[nvmeq->cq_vector].vector, nvmeq);

	/* Don't tell the adapter to delete the admin queue */
	if (qid) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	unsigned extra = (depth + BITS_TO_LONGS(depth)) * sizeof(long);
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_cycle = 1;
	init_waitqueue_head(&nvmeq->sq_full);
	bio_list_init(&nvmeq->sq_cong);
	nvmeq->q_db = &dev->dbs[qid * 2];
	nvmeq->q_depth = depth;
	nvmeq->cq_vector = vector;

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}
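/*
 * Doorbell addressing: q_db points at &dev->dbs[qid * 2] because the
 * register file interleaves submission-tail and completion-head
 * doorbells per queue (the CQ head doorbell is written through
 * q_db + 1 in nvme_process_cq).  This assumes the controller's
 * doorbell stride (CAP.DSTRD) is zero, i.e. consecutive 32-bit
 * registers.
 */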
static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
					int qid, int cq_size, int vector)
{
	int result;
	struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector);
	if (!nvmeq)
		return ERR_PTR(-ENOMEM);

	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		goto free_nvmeq;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = request_irq(dev->entry[vector].vector, nvme_irq,
				IRQF_DISABLED | IRQF_SHARED, "nvme", nvmeq);
	if (result < 0)
		goto release_sq;

	return nvmeq;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
 free_nvmeq:
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
	return ERR_PTR(result);
}
static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	struct nvme_queue *nvmeq;

	dev->dbs = ((void __iomem *)dev->bar) + 4096;

	nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
	if (!nvmeq)
		return -ENOMEM;

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;

	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
	writel(dev->ctrl_config, &dev->bar->cc);

	while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
	}

	result = request_irq(dev->entry[0].vector, nvme_irq,
				IRQF_DISABLED | IRQF_SHARED, "nvme admin", nvmeq);
	dev->queues[0] = nvmeq;
	return result;
}
static int nvme_identify(struct nvme_ns *ns, void __user *addr, int cns)
{
	struct nvme_dev *dev = ns->dev;
	int status;
	struct nvme_command c;
	void *page;
	dma_addr_t dma_addr;

	page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
								GFP_KERNEL);

	memset(&c, 0, sizeof(c));
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cns ? 0 : cpu_to_le32(ns->ns_id);
	c.identify.prp1 = cpu_to_le64(dma_addr);
	c.identify.cns = cpu_to_le32(cns);

	status = nvme_submit_admin_cmd(dev, &c, NULL);

	if (status)
		status = -EIO;
	else if (copy_to_user(addr, page, 4096))
		status = -EFAULT;

	dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr);

	return status;
}
static int nvme_get_range_type(struct nvme_ns *ns, void __user *addr)
{
	struct nvme_dev *dev = ns->dev;
	int status;
	struct nvme_command c;
	void *page;
	dma_addr_t dma_addr;

	page = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
								GFP_KERNEL);

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(ns->ns_id);
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);

	status = nvme_submit_admin_cmd(dev, &c, NULL);

	/* XXX: Assuming first range for now */
	if (status)
		status = -EIO;
	else if (copy_to_user(addr, page, 64))
		status = -EFAULT;

	dma_free_coherent(&dev->pci_dev->dev, 4096, page, dma_addr);

	return status;
}
static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_IDENTIFY_NS:
		return nvme_identify(ns, (void __user *)arg, 0);
	case NVME_IOCTL_IDENTIFY_CTRL:
		return nvme_identify(ns, (void __user *)arg, 1);
	case NVME_IOCTL_GET_RANGE_TYPE:
		return nvme_get_range_type(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}
static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
};
static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index,
			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int lbaf;

	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
		return NULL;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;
	ns->queue = blk_alloc_queue(GFP_KERNEL);
	if (!ns->queue)
		goto out_free_ns;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT | QUEUE_FLAG_NOMERGES |
				QUEUE_FLAG_NONROT | QUEUE_FLAG_DISCARD;
	blk_queue_make_request(ns->queue, nvme_make_request);
	ns->dev = dev;
	ns->queue->queuedata = ns;

	disk = alloc_disk(NVME_MINORS);
	if (!disk)
		goto out_free_queue;
	ns->ns_id = index;
	ns->disk = disk;
	lbaf = id->flbas & 0xf;
	ns->lba_shift = id->lbaf[lbaf].ds;

	disk->major = nvme_major;
	disk->minors = NVME_MINORS;
	disk->first_minor = NVME_MINORS * index;
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, index);
	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	return ns;

 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
	return NULL;
}
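/*
 * Capacity math: id->nsze counts logical blocks, while set_capacity()
 * expects 512-byte sectors, hence the shift by (lba_shift - 9).  For a
 * namespace of 1000000 4KB blocks that is 1000000 << 3 = 8000000
 * sectors.
 */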
static void nvme_ns_free(struct nvme_ns *ns)
{
	put_disk(ns->disk);
	blk_cleanup_queue(ns->queue);
	kfree(ns);
}
static int set_queue_count(struct nvme_dev *dev, int sq_count, int cq_count)
{
	int status;
	u32 result;
	struct nvme_command c;
	u32 q_count = (sq_count - 1) | ((cq_count - 1) << 16);

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.fid = cpu_to_le32(NVME_FEAT_NUM_QUEUES);
	c.features.dword11 = cpu_to_le32(q_count);

	status = nvme_submit_admin_cmd(dev, &c, &result);
	if (status)
		return -EIO;
	return min(result & 0xffff, result >> 16) + 1;
}
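/*
 * The Number of Queues feature packs both counts into one dword,
 * zero-based: requested submission queues go in bits 15:0 and
 * completion queues in bits 31:16.  The controller answers in the same
 * format, so min(...) + 1 is however many queue pairs both sides can
 * agree on.
 */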
/* XXX: Create per-CPU queues */
static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
{
	int this_cpu;

	set_queue_count(dev, 1, 1);

	this_cpu = get_cpu();
	dev->queues[1] = nvme_create_queue(dev, 1, NVME_Q_DEPTH, this_cpu);
	put_cpu();
	if (IS_ERR(dev->queues[1]))
		return PTR_ERR(dev->queues[1]);
	dev->queue_count++;
	return 0;
}
static void nvme_free_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_free_queue(dev, i);
}
static int __devinit nvme_dev_add(struct nvme_dev *dev)
{
	int res, nn, i;
	struct nvme_ns *ns, *next;
	void *id;
	dma_addr_t dma_addr;
	struct nvme_command cid, crt;

	res = nvme_setup_io_queues(dev);
	if (res)
		return res;

	/* XXX: Switch to a SG list once prp2 works */
	id = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
								GFP_KERNEL);

	memset(&cid, 0, sizeof(cid));
	cid.identify.opcode = nvme_admin_identify;
	cid.identify.nsid = 0;
	cid.identify.prp1 = cpu_to_le64(dma_addr);
	cid.identify.cns = cpu_to_le32(1);

	res = nvme_submit_admin_cmd(dev, &cid, NULL);
	if (res) {
		res = -EIO;
		goto out_free;
	}

	nn = le32_to_cpup(&((struct nvme_id_ctrl *)id)->nn);

	cid.identify.cns = 0;
	memset(&crt, 0, sizeof(crt));
	crt.features.opcode = nvme_admin_get_features;
	crt.features.prp1 = cpu_to_le64(dma_addr + 4096);
	crt.features.fid = cpu_to_le32(NVME_FEAT_LBA_RANGE);

	for (i = 0; i < nn; i++) {
		cid.identify.nsid = cpu_to_le32(i);
		res = nvme_submit_admin_cmd(dev, &cid, NULL);
		if (res)
			continue;

		if (((struct nvme_id_ns *)id)->ncap == 0)
			continue;

		crt.features.nsid = cpu_to_le32(i);
		res = nvme_submit_admin_cmd(dev, &crt, NULL);
		if (res)
			continue;

		ns = nvme_alloc_ns(dev, i, id, id + 4096);
		if (ns)
			list_add_tail(&ns->list, &dev->namespaces);
	}
	list_for_each_entry(ns, &dev->namespaces, list)
		add_disk(ns->disk);

	dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr);
	return 0;

 out_free:
	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		nvme_ns_free(ns);
	}

	dma_free_coherent(&dev->pci_dev->dev, 8192, id, dma_addr);
	return res;
}
static int nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	/* TODO: wait all I/O finished or cancel them */

	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		del_gendisk(ns->disk);
		nvme_ns_free(ns);
	}

	nvme_free_queues(dev);

	return 0;
}
/* XXX: Use an ida or something to let remove / add work correctly */
static void nvme_set_instance(struct nvme_dev *dev)
{
	static int instance;
	dev->instance = instance++;
}
static void nvme_release_instance(struct nvme_dev *dev)
{
}
static int __devinit nvme_probe(struct pci_dev *pdev,
						const struct pci_device_id *id)
{
	int result = -ENOMEM;
	struct nvme_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
								GFP_KERNEL);
	if (!dev->entry)
		goto free;
	dev->queues = kcalloc(2, sizeof(void *), GFP_KERNEL);
	if (!dev->queues)
		goto free;

	INIT_LIST_HEAD(&dev->namespaces);
	dev->pci_dev = pdev;
	pci_set_drvdata(pdev, dev);
	dma_set_mask(&dev->pci_dev->dev, DMA_BIT_MASK(64));
	nvme_set_instance(dev);

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar)
		goto disable;

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;
	dev->queue_count++;

	result = nvme_dev_add(dev);
	if (result)
		goto delete;
	return 0;

 delete:
	nvme_free_queues(dev);
 unmap:
	iounmap(dev->bar);
 disable:
	pci_disable_msix(pdev);
	nvme_release_instance(dev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}
static void __devexit nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_remove(dev);
	pci_disable_msix(pdev);
	iounmap(dev->bar);
	nvme_release_instance(dev);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}
/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL
#define nvme_suspend NULL
#define nvme_resume NULL
static struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
};
/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);
static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= __devexit_p(nvme_remove),
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.err_handler	= &nvme_err_handler,
};
static int __init nvme_init(void)
{
	int result;

	nvme_major = register_blkdev(nvme_major, "nvme");
	if (nvme_major <= 0)
		return -EBUSY;

	result = pci_register_driver(&nvme_driver);
	if (result)
		unregister_blkdev(nvme_major, "nvme");
	return result;
}
static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
}
MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.1");
module_init(nvme_init);
module_exit(nvme_exit);