// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics TCP host.
 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/nvme-tcp.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <linux/blk-mq.h>
#include <crypto/hash.h>

#include "nvme.h"
#include "fabrics.h"
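
/*
 * Rough flow of this host driver as implemented below: the host sends an
 * ICReq and expects an ICResp on connect, then exchanges command capsules
 * (nvme_tcp_cmd), controller-to-host data (nvme_tcp_c2h_data), ready-to-
 * transfer requests (nvme_tcp_r2t) answered with host-to-controller data
 * (nvme_tcp_h2c_data), and completions (nvme_tcp_rsp). Optional CRC32C
 * header/data digests are negotiated per queue.
 */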

struct nvme_tcp_queue;

enum nvme_tcp_send_state {
	NVME_TCP_SEND_CMD_PDU = 0,
	NVME_TCP_SEND_H2C_PDU,
	NVME_TCP_SEND_DATA,
	NVME_TCP_SEND_DDGST,
};

struct nvme_tcp_request {
	struct nvme_request	req;
	void			*pdu;
	struct nvme_tcp_queue	*queue;
	u32			data_len;
	u32			pdu_len;
	u32			pdu_sent;
	struct list_head	entry;
	__le32			ddgst;

	struct bio		*curr_bio;
	struct iov_iter		iter;

	/* send state */
	size_t			offset;
	size_t			data_sent;
	enum nvme_tcp_send_state state;
};

enum nvme_tcp_queue_flags {
	NVME_TCP_Q_ALLOCATED	= 0,
	NVME_TCP_Q_LIVE		= 1,
};

enum nvme_tcp_recv_state {
	NVME_TCP_RECV_PDU	= 0,
	NVME_TCP_RECV_DATA,
	NVME_TCP_RECV_DDGST,
};
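
/*
 * Receive progress is tracked with the pdu/data/ddgst "remaining" counters
 * in struct nvme_tcp_queue below: nvme_tcp_recv_state() derives the current
 * state (PDU header, data, or data digest) purely from which counter is
 * still non-zero.
 */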

struct nvme_tcp_queue {
	struct socket		*sock;
	struct work_struct	io_work;
	int			io_cpu;

	spinlock_t		lock;
	struct list_head	send_list;

	/* recv state */
	void			*pdu;
	int			pdu_remaining;
	int			pdu_offset;
	size_t			data_remaining;
	size_t			ddgst_remaining;

	/* send state */
	struct nvme_tcp_request *request;

	int			queue_size;
	size_t			cmnd_capsule_len;
	struct nvme_tcp_ctrl	*ctrl;
	unsigned long		flags;
	bool			rd_enabled;

	bool			hdr_digest;
	bool			data_digest;
	struct ahash_request	*rcv_hash;
	struct ahash_request	*snd_hash;
	__le32			exp_ddgst;
	__le32			recv_ddgst;

	struct page_frag_cache	pf_cache;

	void (*state_change)(struct sock *);
	void (*data_ready)(struct sock *);
	void (*write_space)(struct sock *);
};

struct nvme_tcp_ctrl {
	/* read only in the hot path */
	struct nvme_tcp_queue	*queues;
	struct blk_mq_tag_set	tag_set;

	/* other member variables */
	struct list_head	list;
	struct blk_mq_tag_set	admin_tag_set;
	struct sockaddr_storage addr;
	struct sockaddr_storage src_addr;
	struct nvme_ctrl	ctrl;

	struct work_struct	err_work;
	struct delayed_work	connect_work;
	struct nvme_tcp_request async_req;
};

static LIST_HEAD(nvme_tcp_ctrl_list);
static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
static struct workqueue_struct *nvme_tcp_wq;
static struct blk_mq_ops nvme_tcp_mq_ops;
static struct blk_mq_ops nvme_tcp_admin_mq_ops;

static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
}

static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
{
	return queue - queue->ctrl->queues;
}

static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
{
	u32 queue_idx = nvme_tcp_queue_id(queue);

	if (queue_idx == 0)
		return queue->ctrl->admin_tag_set.tags[queue_idx];
	return queue->ctrl->tag_set.tags[queue_idx - 1];
}

static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
{
	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}

static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
{
	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}

static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
{
	return queue->cmnd_capsule_len - sizeof(struct nvme_command);
}
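
/*
 * In-capsule (inline) data is whatever fits in the command capsule after the
 * 64-byte nvme_command. For example, assuming a controller that advertises
 * ioccsz == 8, the capsule is 8 * 16 = 128 bytes and up to 64 bytes of write
 * data can travel inline; admin queues use NVME_TCP_ADMIN_CCSZ instead.
 */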

static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
{
	return req == &req->queue->ctrl->async_req;
}

static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
{
	struct request *rq;
	unsigned int bytes;

	if (unlikely(nvme_tcp_async_req(req)))
		return false; /* async events don't have a request */

	rq = blk_mq_rq_from_pdu(req);
	bytes = blk_rq_payload_bytes(rq);

	return rq_data_dir(rq) == WRITE && bytes &&
		bytes <= nvme_tcp_inline_data_size(req->queue);
}

static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
{
	return req->iter.bvec->bv_page;
}

static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
{
	return req->iter.bvec->bv_offset + req->iter.iov_offset;
}

static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
{
	return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
			req->pdu_len - req->pdu_sent);
}

static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
{
	return req->iter.iov_offset;
}

static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
{
	return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
			req->pdu_len - req->pdu_sent : 0;
}

static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
		int len)
{
	return nvme_tcp_pdu_data_left(req) <= len;
}
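
/*
 * The iov_iter set up below walks the request payload as a bvec iterator:
 * either the single special payload vector (e.g. discard) or the bio_vec
 * array of the bio currently being transferred, starting at bi_bvec_done
 * within it.
 */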

static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
		unsigned int dir)
{
	struct request *rq = blk_mq_rq_from_pdu(req);
	struct bio_vec *vec;
	unsigned int size;
	int nsegs;
	size_t offset;

	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
		vec = &rq->special_vec;
		nsegs = 1;
		size = blk_rq_payload_bytes(rq);
		offset = 0;
	} else {
		struct bio *bio = req->curr_bio;

		vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
		nsegs = bio_segments(bio);
		size = bio->bi_iter.bi_size;
		offset = bio->bi_iter.bi_bvec_done;
	}

	iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
	req->iter.iov_offset = offset;
}

static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
		int len)
{
	req->data_sent += len;
	req->pdu_sent += len;
	iov_iter_advance(&req->iter, len);
	if (!iov_iter_count(&req->iter) &&
	    req->data_sent < req->data_len) {
		req->curr_bio = req->curr_bio->bi_next;
		nvme_tcp_init_iter(req, WRITE);
	}
}

static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
{
	struct nvme_tcp_queue *queue = req->queue;

	spin_lock(&queue->lock);
	list_add_tail(&req->entry, &queue->send_list);
	spin_unlock(&queue->lock);

	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}

static inline struct nvme_tcp_request *
nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_request *req;

	spin_lock(&queue->lock);
	req = list_first_entry_or_null(&queue->send_list,
			struct nvme_tcp_request, entry);
	if (req)
		list_del(&req->entry);
	spin_unlock(&queue->lock);

	return req;
}

static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
		__le32 *dgst)
{
	ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
	crypto_ahash_final(hash);
}

static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
		struct page *page, off_t off, size_t len)
{
	struct scatterlist sg;

	sg_init_marker(&sg, 1);
	sg_set_page(&sg, page, len, off);
	ahash_request_set_crypt(hash, &sg, NULL, len);
	crypto_ahash_update(hash);
}

static inline void nvme_tcp_hdgst(struct ahash_request *hash,
		void *pdu, size_t len)
{
	struct scatterlist sg;

	sg_init_one(&sg, pdu, len);
	ahash_request_set_crypt(hash, &sg, pdu + len, len);
	crypto_ahash_digest(hash);
}
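
/*
 * On the wire a PDU with digests enabled looks roughly like:
 *
 *	[ hdr (hlen bytes) ][ HDGST (4) ][ data ][ DDGST (4) ]
 *
 * Both digests are CRC32C. nvme_tcp_hdgst() above writes the header digest
 * directly behind the header, which is also how it is recomputed and
 * compared on receive.
 */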

static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
		void *pdu, size_t pdu_len)
{
	struct nvme_tcp_hdr *hdr = pdu;
	__le32 recv_digest;
	__le32 exp_digest;

	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d: header digest flag is cleared\n",
			nvme_tcp_queue_id(queue));
		return -EPROTO;
	}

	recv_digest = *(__le32 *)(pdu + hdr->hlen);
	nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
	exp_digest = *(__le32 *)(pdu + hdr->hlen);
	if (recv_digest != exp_digest) {
		dev_err(queue->ctrl->ctrl.device,
			"header digest error: recv %#x expected %#x\n",
			le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
		return -EIO;
	}

	return 0;
}

static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
{
	struct nvme_tcp_hdr *hdr = pdu;
	u8 digest_len = nvme_tcp_hdgst_len(queue);
	u32 len;

	len = le32_to_cpu(hdr->plen) - hdr->hlen -
		((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);

	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d: data digest flag is cleared\n",
			nvme_tcp_queue_id(queue));
		return -EPROTO;
	}
	crypto_ahash_init(queue->rcv_hash);

	return 0;
}

static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
		struct request *rq, unsigned int hctx_idx)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);

	page_frag_free(req->pdu);
}

static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
		struct request *rq, unsigned int hctx_idx,
		unsigned int numa_node)
{
	struct nvme_tcp_ctrl *ctrl = set->driver_data;
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
	u8 hdgst = nvme_tcp_hdgst_len(queue);

	req->pdu = page_frag_alloc(&queue->pf_cache,
			sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
			GFP_KERNEL | __GFP_ZERO);
	if (!req->pdu)
		return -ENOMEM;

	req->queue = queue;
	nvme_req(rq)->ctrl = &ctrl->ctrl;

	return 0;
}

static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	struct nvme_tcp_ctrl *ctrl = data;
	struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];

	hctx->driver_data = queue;
	return 0;
}

static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
		unsigned int hctx_idx)
{
	struct nvme_tcp_ctrl *ctrl = data;
	struct nvme_tcp_queue *queue = &ctrl->queues[0];

	hctx->driver_data = queue;
	return 0;
}

static enum nvme_tcp_recv_state
nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
{
	return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
		(queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
		NVME_TCP_RECV_DATA;
}

static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
{
	queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
				nvme_tcp_hdgst_len(queue);
	queue->pdu_offset = 0;
	queue->data_remaining = -1;
	queue->ddgst_remaining = 0;
}

static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
		return;

	queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work);
}

static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
		struct nvme_completion *cqe)
{
	struct request *rq;

	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
	if (!rq) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d tag 0x%x not found\n",
			nvme_tcp_queue_id(queue), cqe->command_id);
		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
		return -EINVAL;
	}

	nvme_end_request(rq, cqe->status, cqe->result);

	return 0;
}

static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
		struct nvme_tcp_data_pdu *pdu)
{
	struct request *rq;

	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
	if (!rq) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d tag %#x not found\n",
			nvme_tcp_queue_id(queue), pdu->command_id);
		return -ENOENT;
	}

	if (!blk_rq_payload_bytes(rq)) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d tag %#x unexpected data\n",
			nvme_tcp_queue_id(queue), rq->tag);
		return -EIO;
	}

	queue->data_remaining = le32_to_cpu(pdu->data_length);

	return 0;
}

static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
		struct nvme_tcp_rsp_pdu *pdu)
{
	struct nvme_completion *cqe = &pdu->cqe;
	int ret = 0;

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts. We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
	    cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
		nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
				&cqe->result);
	else
		ret = nvme_tcp_process_nvme_cqe(queue, cqe);

	return ret;
}
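
/*
 * An R2T from the controller is answered by rewriting the request's PDU as
 * an H2C data PDU covering the solicited range. Its plen is
 * hlen + hdgst + pdu_len + ddgst, e.g. 24 + 4 + 8192 + 4 bytes for an 8K
 * transfer with both digests enabled (sizes here are illustrative).
 */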

static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
		struct nvme_tcp_r2t_pdu *pdu)
{
	struct nvme_tcp_data_pdu *data = req->pdu;
	struct nvme_tcp_queue *queue = req->queue;
	struct request *rq = blk_mq_rq_from_pdu(req);
	u8 hdgst = nvme_tcp_hdgst_len(queue);
	u8 ddgst = nvme_tcp_ddgst_len(queue);

	req->pdu_len = le32_to_cpu(pdu->r2t_length);
	req->pdu_sent = 0;

	if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
		dev_err(queue->ctrl->ctrl.device,
			"req %d r2t len %u exceeded data len %u (%zu sent)\n",
			rq->tag, req->pdu_len, req->data_len,
			req->data_sent);
		return -EPROTO;
	}

	if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
		dev_err(queue->ctrl->ctrl.device,
			"req %d unexpected r2t offset %u (expected %zu)\n",
			rq->tag, le32_to_cpu(pdu->r2t_offset),
			req->data_sent);
		return -EPROTO;
	}

	memset(data, 0, sizeof(*data));
	data->hdr.type = nvme_tcp_h2c_data;
	data->hdr.flags = NVME_TCP_F_DATA_LAST;
	if (queue->hdr_digest)
		data->hdr.flags |= NVME_TCP_F_HDGST;
	if (queue->data_digest)
		data->hdr.flags |= NVME_TCP_F_DDGST;
	data->hdr.hlen = sizeof(*data);
	data->hdr.pdo = data->hdr.hlen + hdgst;
	data->hdr.plen =
		cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
	data->ttag = pdu->ttag;
	data->command_id = rq->tag;
	data->data_offset = cpu_to_le32(req->data_sent);
	data->data_length = cpu_to_le32(req->pdu_len);
	return 0;
}

static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
		struct nvme_tcp_r2t_pdu *pdu)
{
	struct nvme_tcp_request *req;
	struct request *rq;
	int ret;

	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
	if (!rq) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d tag %#x not found\n",
			nvme_tcp_queue_id(queue), pdu->command_id);
		return -ENOENT;
	}
	req = blk_mq_rq_to_pdu(rq);

	ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
	if (unlikely(ret))
		return ret;

	req->state = NVME_TCP_SEND_H2C_PDU;
	req->offset = 0;

	nvme_tcp_queue_request(req);

	return 0;
}
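
/*
 * nvme_tcp_recv_pdu() below reassembles a PDU header that may arrive split
 * across several skbs; only once pdu_remaining reaches zero are the digests
 * checked and the PDU dispatched by type.
 */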

static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
		unsigned int *offset, size_t *len)
{
	struct nvme_tcp_hdr *hdr;
	char *pdu = queue->pdu;
	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
	int ret;

	ret = skb_copy_bits(skb, *offset,
		&pdu[queue->pdu_offset], rcv_len);
	if (unlikely(ret))
		return ret;

	queue->pdu_remaining -= rcv_len;
	queue->pdu_offset += rcv_len;
	*offset += rcv_len;
	*len -= rcv_len;
	if (queue->pdu_remaining)
		return 0;

	hdr = queue->pdu;
	if (queue->hdr_digest) {
		ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
		if (unlikely(ret))
			return ret;
	}

	if (queue->data_digest) {
		ret = nvme_tcp_check_ddgst(queue, queue->pdu);
		if (unlikely(ret))
			return ret;
	}

	switch (hdr->type) {
	case nvme_tcp_c2h_data:
		ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
		break;
	case nvme_tcp_rsp:
		nvme_tcp_init_recv_ctx(queue);
		ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
		break;
	case nvme_tcp_r2t:
		nvme_tcp_init_recv_ctx(queue);
		ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
		break;
	default:
		dev_err(queue->ctrl->ctrl.device,
			"unsupported pdu type (%d)\n", hdr->type);
		ret = -EINVAL;
	}

	return ret;
}

static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
		unsigned int *offset, size_t *len)
{
	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
	struct nvme_tcp_request *req;
	struct request *rq;

	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
	if (!rq) {
		dev_err(queue->ctrl->ctrl.device,
			"queue %d tag %#x not found\n",
			nvme_tcp_queue_id(queue), pdu->command_id);
		return -ENOENT;
	}
	req = blk_mq_rq_to_pdu(rq);

	while (true) {
		int recv_len, ret;

		recv_len = min_t(size_t, *len, queue->data_remaining);
		if (!recv_len)
			break;

		if (!iov_iter_count(&req->iter)) {
			req->curr_bio = req->curr_bio->bi_next;

			/*
			 * If we don't have any bios it means that controller
			 * sent more data than we requested, hence error
			 */
			if (!req->curr_bio) {
				dev_err(queue->ctrl->ctrl.device,
					"queue %d no space in request %#x",
					nvme_tcp_queue_id(queue), rq->tag);
				nvme_tcp_init_recv_ctx(queue);
				return -EIO;
			}
			nvme_tcp_init_iter(req, READ);
		}

		/* we can read only from what is left in this bio */
		recv_len = min_t(size_t, recv_len,
				iov_iter_count(&req->iter));

		if (queue->data_digest)
			ret = skb_copy_and_hash_datagram_iter(skb, *offset,
				&req->iter, recv_len, queue->rcv_hash);
		else
			ret = skb_copy_datagram_iter(skb, *offset,
				&req->iter, recv_len);
		if (ret) {
			dev_err(queue->ctrl->ctrl.device,
				"queue %d failed to copy request %#x data",
				nvme_tcp_queue_id(queue), rq->tag);
			return ret;
		}

		*len -= recv_len;
		*offset += recv_len;
		queue->data_remaining -= recv_len;
	}

	if (!queue->data_remaining) {
		if (queue->data_digest) {
			nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
			queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
		} else {
			nvme_tcp_init_recv_ctx(queue);
		}
	}

	return 0;
}

static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
		struct sk_buff *skb, unsigned int *offset, size_t *len)
{
	char *ddgst = (char *)&queue->recv_ddgst;
	size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
	int ret;

	ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
	if (unlikely(ret))
		return ret;

	queue->ddgst_remaining -= recv_len;
	*offset += recv_len;
	*len -= recv_len;
	if (queue->ddgst_remaining)
		return 0;

	if (queue->recv_ddgst != queue->exp_ddgst) {
		dev_err(queue->ctrl->ctrl.device,
			"data digest error: recv %#x expected %#x\n",
			le32_to_cpu(queue->recv_ddgst),
			le32_to_cpu(queue->exp_ddgst));
		return -EIO;
	}

	nvme_tcp_init_recv_ctx(queue);
	return 0;
}

static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
			     unsigned int offset, size_t len)
{
	struct nvme_tcp_queue *queue = desc->arg.data;
	size_t consumed = len;
	int result;

	while (len) {
		switch (nvme_tcp_recv_state(queue)) {
		case NVME_TCP_RECV_PDU:
			result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
			break;
		case NVME_TCP_RECV_DATA:
			result = nvme_tcp_recv_data(queue, skb, &offset, &len);
			break;
		case NVME_TCP_RECV_DDGST:
			result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
			break;
		default:
			result = -EFAULT;
		}
		if (result) {
			dev_err(queue->ctrl->ctrl.device,
				"receive failed: %d\n", result);
			queue->rd_enabled = false;
			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
			return result;
		}
	}

	return consumed;
}

static void nvme_tcp_data_ready(struct sock *sk)
{
	struct nvme_tcp_queue *queue;

	read_lock(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (likely(queue && queue->rd_enabled))
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
	read_unlock(&sk->sk_callback_lock);
}

static void nvme_tcp_write_space(struct sock *sk)
{
	struct nvme_tcp_queue *queue;

	read_lock_bh(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (likely(queue && sk_stream_is_writeable(sk))) {
		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
	}
	read_unlock_bh(&sk->sk_callback_lock);
}

static void nvme_tcp_state_change(struct sock *sk)
{
	struct nvme_tcp_queue *queue;

	read_lock(&sk->sk_callback_lock);
	queue = sk->sk_user_data;
	if (!queue)
		goto done;

	switch (sk->sk_state) {
	case TCP_CLOSE:
	case TCP_CLOSE_WAIT:
	case TCP_LAST_ACK:
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
		/* fallthrough */
		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
		break;
	default:
		dev_info(queue->ctrl->ctrl.device,
			"queue %d socket state %d\n",
			nvme_tcp_queue_id(queue), sk->sk_state);
	}

	queue->state_change(sk);
done:
	read_unlock(&sk->sk_callback_lock);
}

static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
{
	queue->request = NULL;
}

static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
{
	union nvme_result res = {};

	nvme_end_request(blk_mq_rq_from_pdu(req),
		cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res);
}
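
/*
 * The send side pushes the request pages directly with kernel_sendpage()
 * and keeps TCP from flushing early with MSG_MORE until the last fragment
 * of the PDU (or its trailing data digest) goes out with MSG_EOR.
 */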

static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
{
	struct nvme_tcp_queue *queue = req->queue;

	while (true) {
		struct page *page = nvme_tcp_req_cur_page(req);
		size_t offset = nvme_tcp_req_cur_offset(req);
		size_t len = nvme_tcp_req_cur_length(req);
		bool last = nvme_tcp_pdu_last_send(req, len);
		int ret, flags = MSG_DONTWAIT;

		if (last && !queue->data_digest)
			flags |= MSG_EOR;
		else
			flags |= MSG_MORE;

		ret = kernel_sendpage(queue->sock, page, offset, len, flags);
		if (ret <= 0)
			return ret;

		nvme_tcp_advance_req(req, ret);
		if (queue->data_digest)
			nvme_tcp_ddgst_update(queue->snd_hash, page,
					offset, ret);

		/* fully successful last write */
		if (last && ret == len) {
			if (queue->data_digest) {
				nvme_tcp_ddgst_final(queue->snd_hash,
					&req->ddgst);
				req->state = NVME_TCP_SEND_DDGST;
				req->offset = 0;
			} else {
				nvme_tcp_done_send_req(queue);
			}
			return 1;
		}
	}
	return -EAGAIN;
}

static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
{
	struct nvme_tcp_queue *queue = req->queue;
	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
	bool inline_data = nvme_tcp_has_inline_data(req);
	int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
	u8 hdgst = nvme_tcp_hdgst_len(queue);
	int len = sizeof(*pdu) + hdgst - req->offset;
	int ret;

	if (queue->hdr_digest && !req->offset)
		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));

	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
			offset_in_page(pdu) + req->offset, len, flags);
	if (unlikely(ret <= 0))
		return ret;

	len -= ret;
	if (!len) {
		if (inline_data) {
			req->state = NVME_TCP_SEND_DATA;
			if (queue->data_digest)
				crypto_ahash_init(queue->snd_hash);
			nvme_tcp_init_iter(req, WRITE);
		} else {
			nvme_tcp_done_send_req(queue);
		}
		return 1;
	}
	req->offset += ret;

	return -EAGAIN;
}

static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
{
	struct nvme_tcp_queue *queue = req->queue;
	struct nvme_tcp_data_pdu *pdu = req->pdu;
	u8 hdgst = nvme_tcp_hdgst_len(queue);
	int len = sizeof(*pdu) - req->offset + hdgst;
	int ret;

	if (queue->hdr_digest && !req->offset)
		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));

	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
			offset_in_page(pdu) + req->offset, len,
			MSG_DONTWAIT | MSG_MORE);
	if (unlikely(ret <= 0))
		return ret;

	len -= ret;
	if (!len) {
		req->state = NVME_TCP_SEND_DATA;
		if (queue->data_digest)
			crypto_ahash_init(queue->snd_hash);
		if (!req->data_sent)
			nvme_tcp_init_iter(req, WRITE);
		return 1;
	}
	req->offset += ret;

	return -EAGAIN;
}

static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
{
	struct nvme_tcp_queue *queue = req->queue;
	int ret;
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
	struct kvec iov = {
		.iov_base = &req->ddgst + req->offset,
		.iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
	};

	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (unlikely(ret <= 0))
		return ret;

	if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
		nvme_tcp_done_send_req(queue);
		return 1;
	}

	req->offset += ret;
	return -EAGAIN;
}
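
/*
 * Per-request send state machine, advanced by nvme_tcp_try_send() below:
 * SEND_CMD_PDU -> SEND_DATA for inline writes, or
 * SEND_H2C_PDU -> SEND_DATA -> SEND_DDGST (the last only with data digest
 * enabled).
 */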

static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_request *req;
	int ret = 1;

	if (!queue->request) {
		queue->request = nvme_tcp_fetch_request(queue);
		if (!queue->request)
			return 0;
	}
	req = queue->request;

	if (req->state == NVME_TCP_SEND_CMD_PDU) {
		ret = nvme_tcp_try_send_cmd_pdu(req);
		if (ret <= 0)
			goto done;
		if (!nvme_tcp_has_inline_data(req))
			return ret;
	}

	if (req->state == NVME_TCP_SEND_H2C_PDU) {
		ret = nvme_tcp_try_send_data_pdu(req);
		if (ret <= 0)
			goto done;
	}

	if (req->state == NVME_TCP_SEND_DATA) {
		ret = nvme_tcp_try_send_data(req);
		if (ret <= 0)
			goto done;
	}

	if (req->state == NVME_TCP_SEND_DDGST)
		ret = nvme_tcp_try_send_ddgst(req);
done:
	return ret;
}

static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
{
	struct sock *sk = queue->sock->sk;
	read_descriptor_t rd_desc;
	int consumed;

	rd_desc.arg.data = queue;
	rd_desc.count = 1;
	lock_sock(sk);
	consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
	release_sock(sk);
	return consumed;
}
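
/*
 * io_work alternates between sending and receiving on the queue and is
 * intended to bound each invocation to roughly 1ms of work before
 * requeueing itself on queue->io_cpu.
 */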
static void nvme_tcp_io_work(struct work_struct *w)
{
	struct nvme_tcp_queue *queue =
		container_of(w, struct nvme_tcp_queue, io_work);
	unsigned long start = jiffies + msecs_to_jiffies(1);

	do {
		bool pending = false;
		int result;

		result = nvme_tcp_try_send(queue);
		if (result > 0) {
			pending = true;
		} else if (unlikely(result < 0)) {
			dev_err(queue->ctrl->ctrl.device,
				"failed to send request %d\n", result);
			if (result != -EPIPE)
				nvme_tcp_fail_request(queue->request);
			nvme_tcp_done_send_req(queue);
			return;
		}

		result = nvme_tcp_try_recv(queue);
		if (result > 0)
			pending = true;

		if (!pending)
			return;

	} while (time_after(jiffies, start)); /* quota is exhausted */

	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}

static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
{
	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);

	ahash_request_free(queue->rcv_hash);
	ahash_request_free(queue->snd_hash);
	crypto_free_ahash(tfm);
}

static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
{
	struct crypto_ahash *tfm;

	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!queue->snd_hash)
		goto free_tfm;
	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);

	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!queue->rcv_hash)
		goto free_snd_hash;
	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);

	return 0;
free_snd_hash:
	ahash_request_free(queue->snd_hash);
free_tfm:
	crypto_free_ahash(tfm);
	return -ENOMEM;
}

static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
{
	struct nvme_tcp_request *async = &ctrl->async_req;

	page_frag_free(async->pdu);
}

static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
{
	struct nvme_tcp_queue *queue = &ctrl->queues[0];
	struct nvme_tcp_request *async = &ctrl->async_req;
	u8 hdgst = nvme_tcp_hdgst_len(queue);

	async->pdu = page_frag_alloc(&queue->pf_cache,
		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
		GFP_KERNEL | __GFP_ZERO);
	if (!async->pdu)
		return -ENOMEM;

	async->queue = &ctrl->queues[0];
	return 0;
}

static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
	struct nvme_tcp_queue *queue = &ctrl->queues[qid];

	if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
		return;

	if (queue->hdr_digest || queue->data_digest)
		nvme_tcp_free_crypto(queue);

	sock_release(queue->sock);
	kfree(queue->pdu);
}
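
/*
 * Connection setup handshake: the host sends an ICReq (PFV 1.0, maxr2t = 0,
 * hpda = 0, requested digests) and validates the ICResp type, length, pfv,
 * cpda and that the controller's digest choices match what was requested.
 */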
static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_icreq_pdu *icreq;
	struct nvme_tcp_icresp_pdu *icresp;
	struct msghdr msg = {};
	struct kvec iov;
	bool ctrl_hdgst, ctrl_ddgst;
	int ret;

	icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
	if (!icreq)
		return -ENOMEM;

	icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
	if (!icresp) {
		ret = -ENOMEM;
		goto free_icreq;
	}

	icreq->hdr.type = nvme_tcp_icreq;
	icreq->hdr.hlen = sizeof(*icreq);
	icreq->hdr.pdo = 0;
	icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
	icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
	icreq->maxr2t = 0; /* single inflight r2t supported */
	icreq->hpda = 0; /* no alignment constraint */
	if (queue->hdr_digest)
		icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
	if (queue->data_digest)
		icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;

	iov.iov_base = icreq;
	iov.iov_len = sizeof(*icreq);
	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
	if (ret < 0)
		goto free_icresp;

	memset(&msg, 0, sizeof(msg));
	iov.iov_base = icresp;
	iov.iov_len = sizeof(*icresp);
	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
			iov.iov_len, msg.msg_flags);
	if (ret < 0)
		goto free_icresp;

	ret = -EINVAL;
	if (icresp->hdr.type != nvme_tcp_icresp) {
		pr_err("queue %d: bad type returned %d\n",
			nvme_tcp_queue_id(queue), icresp->hdr.type);
		goto free_icresp;
	}

	if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
		pr_err("queue %d: bad pdu length returned %d\n",
			nvme_tcp_queue_id(queue), icresp->hdr.plen);
		goto free_icresp;
	}

	if (icresp->pfv != NVME_TCP_PFV_1_0) {
		pr_err("queue %d: bad pfv returned %d\n",
			nvme_tcp_queue_id(queue), icresp->pfv);
		goto free_icresp;
	}

	ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
	if ((queue->data_digest && !ctrl_ddgst) ||
	    (!queue->data_digest && ctrl_ddgst)) {
		pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
			nvme_tcp_queue_id(queue),
			queue->data_digest ? "enabled" : "disabled",
			ctrl_ddgst ? "enabled" : "disabled");
		goto free_icresp;
	}

	ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
	if ((queue->hdr_digest && !ctrl_hdgst) ||
	    (!queue->hdr_digest && ctrl_hdgst)) {
		pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
			nvme_tcp_queue_id(queue),
			queue->hdr_digest ? "enabled" : "disabled",
			ctrl_hdgst ? "enabled" : "disabled");
		goto free_icresp;
	}

	if (icresp->cpda != 0) {
		pr_err("queue %d: unsupported cpda returned %d\n",
			nvme_tcp_queue_id(queue), icresp->cpda);
		goto free_icresp;
	}

	ret = 0;
free_icresp:
	kfree(icresp);
free_icreq:
	kfree(icreq);
	return ret;
}

static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
		int qid, size_t queue_size)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
	int ret, opt, rcv_pdu_size, n;

	queue->ctrl = ctrl;
	INIT_LIST_HEAD(&queue->send_list);
	spin_lock_init(&queue->lock);
	INIT_WORK(&queue->io_work, nvme_tcp_io_work);
	queue->queue_size = queue_size;

	if (qid > 0)
		queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
	else
		queue->cmnd_capsule_len = sizeof(struct nvme_command) +
						NVME_TCP_ADMIN_CCSZ;

	ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
			IPPROTO_TCP, &queue->sock);
	if (ret) {
		dev_err(ctrl->ctrl.device,
			"failed to create socket: %d\n", ret);
		return ret;
	}

	/* Single syn retry */
	opt = 1;
	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
			(char *)&opt, sizeof(opt));
	if (ret) {
		dev_err(ctrl->ctrl.device,
			"failed to set TCP_SYNCNT sock opt %d\n", ret);
		goto err_sock;
	}

	/* Set TCP no delay */
	opt = 1;
	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
			TCP_NODELAY, (char *)&opt, sizeof(opt));
	if (ret) {
		dev_err(ctrl->ctrl.device,
			"failed to set TCP_NODELAY sock opt %d\n", ret);
		goto err_sock;
	}

	/*
	 * Cleanup whatever is sitting in the TCP transmit queue on socket
	 * close. This is done to prevent stale data from being sent should
	 * the network connection be restored before TCP times out.
	 */
	ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
			(char *)&sol, sizeof(sol));
	if (ret) {
		dev_err(ctrl->ctrl.device,
			"failed to set SO_LINGER sock opt %d\n", ret);
		goto err_sock;
	}

	queue->sock->sk->sk_allocation = GFP_ATOMIC;
	if (!qid)
		n = 0;
	else
		n = (qid - 1) % num_online_cpus();
	queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
	queue->request = NULL;
	queue->data_remaining = 0;
	queue->ddgst_remaining = 0;
	queue->pdu_remaining = 0;
	queue->pdu_offset = 0;
	sk_set_memalloc(queue->sock->sk);

	if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
		ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
			sizeof(ctrl->src_addr));
		if (ret) {
			dev_err(ctrl->ctrl.device,
				"failed to bind queue %d socket %d\n",
				qid, ret);
			goto err_sock;
		}
	}

	queue->hdr_digest = nctrl->opts->hdr_digest;
	queue->data_digest = nctrl->opts->data_digest;
	if (queue->hdr_digest || queue->data_digest) {
		ret = nvme_tcp_alloc_crypto(queue);
		if (ret) {
			dev_err(ctrl->ctrl.device,
				"failed to allocate queue %d crypto\n", qid);
			goto err_sock;
		}
	}

	rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
			nvme_tcp_hdgst_len(queue);
	queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
	if (!queue->pdu) {
		ret = -ENOMEM;
		goto err_crypto;
	}

	dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
			nvme_tcp_queue_id(queue));

	ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
		sizeof(ctrl->addr), 0);
	if (ret) {
		dev_err(ctrl->ctrl.device,
			"failed to connect socket: %d\n", ret);
		goto err_rcv_pdu;
	}

	ret = nvme_tcp_init_connection(queue);
	if (ret)
		goto err_init_connect;

	queue->rd_enabled = true;
	set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
	nvme_tcp_init_recv_ctx(queue);

	write_lock_bh(&queue->sock->sk->sk_callback_lock);
	queue->sock->sk->sk_user_data = queue;
	queue->state_change = queue->sock->sk->sk_state_change;
	queue->data_ready = queue->sock->sk->sk_data_ready;
	queue->write_space = queue->sock->sk->sk_write_space;
	queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
	queue->sock->sk->sk_state_change = nvme_tcp_state_change;
	queue->sock->sk->sk_write_space = nvme_tcp_write_space;
	write_unlock_bh(&queue->sock->sk->sk_callback_lock);

	return 0;

err_init_connect:
	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
err_rcv_pdu:
	kfree(queue->pdu);
err_crypto:
	if (queue->hdr_digest || queue->data_digest)
		nvme_tcp_free_crypto(queue);
err_sock:
	sock_release(queue->sock);
	return ret;
}
*queue
)
1363 struct socket
*sock
= queue
->sock
;
1365 write_lock_bh(&sock
->sk
->sk_callback_lock
);
1366 sock
->sk
->sk_user_data
= NULL
;
1367 sock
->sk
->sk_data_ready
= queue
->data_ready
;
1368 sock
->sk
->sk_state_change
= queue
->state_change
;
1369 sock
->sk
->sk_write_space
= queue
->write_space
;
1370 write_unlock_bh(&sock
->sk
->sk_callback_lock
);

static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
{
	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	nvme_tcp_restore_sock_calls(queue);
	cancel_work_sync(&queue->io_work);
}

static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
	struct nvme_tcp_queue *queue = &ctrl->queues[qid];

	if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
		return;

	__nvme_tcp_stop_queue(queue);
}

static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
	int ret;

	if (idx)
		ret = nvmf_connect_io_queue(nctrl, idx, false);
	else
		ret = nvmf_connect_admin_queue(nctrl);

	if (!ret) {
		set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
	} else {
		__nvme_tcp_stop_queue(&ctrl->queues[idx]);
		dev_err(nctrl->device,
			"failed to connect queue: %d ret=%d\n", idx, ret);
	}
	return ret;
}
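
/*
 * Two tag sets are built below: the admin set (depth NVME_AQ_MQ_TAG_DEPTH,
 * 2 reserved tags for connect + keep-alive) and the I/O set (depth
 * sqsize + 1, 1 reserved tag for the fabrics connect command, two maps for
 * default and read queues).
 */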
static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
		bool admin)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
	struct blk_mq_tag_set *set;
	int ret;

	if (admin) {
		set = &ctrl->admin_tag_set;
		memset(set, 0, sizeof(*set));
		set->ops = &nvme_tcp_admin_mq_ops;
		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
		set->reserved_tags = 2; /* connect + keep-alive */
		set->numa_node = NUMA_NO_NODE;
		set->cmd_size = sizeof(struct nvme_tcp_request);
		set->driver_data = ctrl;
		set->nr_hw_queues = 1;
		set->timeout = ADMIN_TIMEOUT;
	} else {
		set = &ctrl->tag_set;
		memset(set, 0, sizeof(*set));
		set->ops = &nvme_tcp_mq_ops;
		set->queue_depth = nctrl->sqsize + 1;
		set->reserved_tags = 1; /* fabric connect */
		set->numa_node = NUMA_NO_NODE;
		set->flags = BLK_MQ_F_SHOULD_MERGE;
		set->cmd_size = sizeof(struct nvme_tcp_request);
		set->driver_data = ctrl;
		set->nr_hw_queues = nctrl->queue_count - 1;
		set->timeout = NVME_IO_TIMEOUT;
		set->nr_maps = 2 /* default + read */;
	}

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ERR_PTR(ret);

	return set;
}

static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
{
	if (to_tcp_ctrl(ctrl)->async_req.pdu) {
		nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
		to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
	}

	nvme_tcp_free_queue(ctrl, 0);
}

static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
{
	int i;

	for (i = 1; i < ctrl->queue_count; i++)
		nvme_tcp_free_queue(ctrl, i);
}

static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
{
	int i;

	for (i = 1; i < ctrl->queue_count; i++)
		nvme_tcp_stop_queue(ctrl, i);
}

static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
{
	int i, ret;

	for (i = 1; i < ctrl->queue_count; i++) {
		ret = nvme_tcp_start_queue(ctrl, i);
		if (ret)
			goto out_stop_queues;
	}

	return 0;

out_stop_queues:
	for (i--; i >= 1; i--)
		nvme_tcp_stop_queue(ctrl, i);
	return ret;
}

static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
{
	int ret;

	ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
	if (ret)
		return ret;

	ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
	if (ret)
		goto out_free_queue;

	return 0;

out_free_queue:
	nvme_tcp_free_queue(ctrl, 0);
	return ret;
}

static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
	int i, ret;

	for (i = 1; i < ctrl->queue_count; i++) {
		ret = nvme_tcp_alloc_queue(ctrl, i,
				ctrl->sqsize + 1);
		if (ret)
			goto out_free_queues;
	}

	return 0;

out_free_queues:
	for (i--; i >= 1; i--)
		nvme_tcp_free_queue(ctrl, i);
	return ret;
}

static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
{
	unsigned int nr_io_queues;

	nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
	nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());

	return nr_io_queues;
}

static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
{
	unsigned int nr_io_queues;
	int ret;

	nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
	ret = nvme_set_queue_count(ctrl, &nr_io_queues);
	if (ret)
		return ret;

	ctrl->queue_count = nr_io_queues + 1;
	if (ctrl->queue_count < 2)
		return 0;

	dev_info(ctrl->device,
		"creating %d I/O queues.\n", nr_io_queues);

	return nvme_tcp_alloc_io_queues(ctrl);
}

static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
{
	nvme_tcp_stop_io_queues(ctrl);
	if (remove) {
		blk_cleanup_queue(ctrl->connect_q);
		blk_mq_free_tag_set(ctrl->tagset);
	}
	nvme_tcp_free_io_queues(ctrl);
}

static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
{
	int ret;

	ret = nvme_alloc_io_queues(ctrl);
	if (ret)
		return ret;

	if (new) {
		ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
		if (IS_ERR(ctrl->tagset)) {
			ret = PTR_ERR(ctrl->tagset);
			goto out_free_io_queues;
		}

		ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
		if (IS_ERR(ctrl->connect_q)) {
			ret = PTR_ERR(ctrl->connect_q);
			goto out_free_tag_set;
		}
	} else {
		blk_mq_update_nr_hw_queues(ctrl->tagset,
			ctrl->queue_count - 1);
	}

	ret = nvme_tcp_start_io_queues(ctrl);
	if (ret)
		goto out_cleanup_connect_q;

	return 0;

out_cleanup_connect_q:
	if (new)
		blk_cleanup_queue(ctrl->connect_q);
out_free_tag_set:
	if (new)
		blk_mq_free_tag_set(ctrl->tagset);
out_free_io_queues:
	nvme_tcp_free_io_queues(ctrl);
	return ret;
}

static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
{
	nvme_tcp_stop_queue(ctrl, 0);
	if (remove) {
		blk_cleanup_queue(ctrl->admin_q);
		blk_mq_free_tag_set(ctrl->admin_tagset);
	}
	nvme_tcp_free_admin_queue(ctrl);
}

static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
{
	int error;

	error = nvme_tcp_alloc_admin_queue(ctrl);
	if (error)
		return error;

	if (new) {
		ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
		if (IS_ERR(ctrl->admin_tagset)) {
			error = PTR_ERR(ctrl->admin_tagset);
			goto out_free_queue;
		}

		ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
		if (IS_ERR(ctrl->admin_q)) {
			error = PTR_ERR(ctrl->admin_q);
			goto out_free_tagset;
		}
	}

	error = nvme_tcp_start_queue(ctrl, 0);
	if (error)
		goto out_cleanup_queue;

	error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
	if (error) {
		dev_err(ctrl->device,
			"prop_get NVME_REG_CAP failed\n");
		goto out_stop_queue;
	}

	ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);

	error = nvme_enable_ctrl(ctrl, ctrl->cap);
	if (error)
		goto out_stop_queue;

	error = nvme_init_identify(ctrl);
	if (error)
		goto out_stop_queue;

	return 0;

out_stop_queue:
	nvme_tcp_stop_queue(ctrl, 0);
out_cleanup_queue:
	if (new)
		blk_cleanup_queue(ctrl->admin_q);
out_free_tagset:
	if (new)
		blk_mq_free_tag_set(ctrl->admin_tagset);
out_free_queue:
	nvme_tcp_free_admin_queue(ctrl);
	return error;
}

static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
		bool remove)
{
	blk_mq_quiesce_queue(ctrl->admin_q);
	nvme_tcp_stop_queue(ctrl, 0);
	blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl);
	blk_mq_unquiesce_queue(ctrl->admin_q);
	nvme_tcp_destroy_admin_queue(ctrl, remove);
}

static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
		bool remove)
{
	if (ctrl->queue_count <= 1)
		return;
	nvme_stop_queues(ctrl);
	nvme_tcp_stop_io_queues(ctrl);
	blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl);
	if (remove)
		nvme_start_queues(ctrl);
	nvme_tcp_destroy_io_queues(ctrl, remove);
}

static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
{
	/* If we are resetting/deleting then do nothing */
	if (ctrl->state != NVME_CTRL_CONNECTING) {
		WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
			ctrl->state == NVME_CTRL_LIVE);
		return;
	}

	if (nvmf_should_reconnect(ctrl)) {
		dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
			ctrl->opts->reconnect_delay);
		queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
				ctrl->opts->reconnect_delay * HZ);
	} else {
		dev_info(ctrl->device, "Removing controller...\n");
		nvme_delete_ctrl(ctrl);
	}
}

static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
{
	struct nvmf_ctrl_options *opts = ctrl->opts;
	int ret;

	ret = nvme_tcp_configure_admin_queue(ctrl, new);
	if (ret)
		return ret;

	if (ctrl->icdoff) {
		dev_err(ctrl->device, "icdoff is not supported!\n");
		goto destroy_admin;
	}

	if (opts->queue_size > ctrl->sqsize + 1)
		dev_warn(ctrl->device,
			"queue_size %zu > ctrl sqsize %u, clamping down\n",
			opts->queue_size, ctrl->sqsize + 1);

	if (ctrl->sqsize + 1 > ctrl->maxcmd) {
		dev_warn(ctrl->device,
			"sqsize %u > ctrl maxcmd %u, clamping down\n",
			ctrl->sqsize + 1, ctrl->maxcmd);
		ctrl->sqsize = ctrl->maxcmd - 1;
	}

	if (ctrl->queue_count > 1) {
		ret = nvme_tcp_configure_io_queues(ctrl, new);
		if (ret)
			goto destroy_admin;
	}

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
		/* state change failure is ok if we're in DELETING state */
		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
		ret = -EINVAL;
		goto destroy_io;
	}

	nvme_start_ctrl(ctrl);
	return 0;

destroy_io:
	if (ctrl->queue_count > 1)
		nvme_tcp_destroy_io_queues(ctrl, new);
destroy_admin:
	nvme_tcp_stop_queue(ctrl, 0);
	nvme_tcp_destroy_admin_queue(ctrl, new);
	return ret;
}

static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
{
	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
			struct nvme_tcp_ctrl, connect_work);
	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;

	++ctrl->nr_reconnects;

	if (nvme_tcp_setup_ctrl(ctrl, false))
		goto requeue;

	dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
			ctrl->nr_reconnects);

	ctrl->nr_reconnects = 0;

	return;

requeue:
	dev_info(ctrl->device, "Failed reconnect attempt %d\n",
			ctrl->nr_reconnects);
	nvme_tcp_reconnect_or_remove(ctrl);
}

static void nvme_tcp_error_recovery_work(struct work_struct *work)
{
	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
				struct nvme_tcp_ctrl, err_work);
	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;

	nvme_stop_keep_alive(ctrl);
	nvme_tcp_teardown_io_queues(ctrl, false);
	/* unquiesce to fail fast pending requests */
	nvme_start_queues(ctrl);
	nvme_tcp_teardown_admin_queue(ctrl, false);

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we're in DELETING state */
		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
		return;
	}

	nvme_tcp_reconnect_or_remove(ctrl);
}

static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
	nvme_tcp_teardown_io_queues(ctrl, shutdown);
	if (shutdown)
		nvme_shutdown_ctrl(ctrl);
	else
		nvme_disable_ctrl(ctrl, ctrl->cap);
	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
}

static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_tcp_teardown_ctrl(ctrl, true);
}

static void nvme_reset_ctrl_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, reset_work);

	nvme_stop_ctrl(ctrl);
	nvme_tcp_teardown_ctrl(ctrl, false);

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we're in DELETING state */
		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
		return;
	}

	if (nvme_tcp_setup_ctrl(ctrl, false))
		goto out_fail;

	return;

out_fail:
	++ctrl->nr_reconnects;
	nvme_tcp_reconnect_or_remove(ctrl);
}

static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
{
	cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
}

static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);

	if (list_empty(&ctrl->list))
		goto free_ctrl;

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_del(&ctrl->list);
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	nvmf_free_options(nctrl->opts);
free_ctrl:
	kfree(ctrl->queues);
	kfree(ctrl);
}

static void nvme_tcp_set_sg_null(struct nvme_command *c)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = 0;
	sg->length = 0;
	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
			NVME_SGL_FMT_TRANSPORT_A;
}

static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
		struct nvme_command *c, u32 data_len)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
	sg->length = cpu_to_le32(data_len);
	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
}

static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
		u32 data_len)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = 0;
	sg->length = cpu_to_le32(data_len);
	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
			NVME_SGL_FMT_TRANSPORT_A;
}
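
/*
 * Data is described to the controller with a single SGL descriptor: an
 * offset-based descriptor when the payload travels inline in the capsule,
 * otherwise a transport-specific "host data" descriptor, and a null
 * descriptor for commands without data (such as the async event below).
 */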
static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
	struct nvme_tcp_queue *queue = &ctrl->queues[0];
	struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
	struct nvme_command *cmd = &pdu->cmd;
	u8 hdgst = nvme_tcp_hdgst_len(queue);

	memset(pdu, 0, sizeof(*pdu));
	pdu->hdr.type = nvme_tcp_cmd;
	if (queue->hdr_digest)
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);

	cmd->common.opcode = nvme_admin_async_event;
	cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
	cmd->common.flags |= NVME_CMD_SGL_METABUF;
	nvme_tcp_set_sg_null(cmd);

	ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
	ctrl->async_req.offset = 0;
	ctrl->async_req.curr_bio = NULL;
	ctrl->async_req.data_len = 0;

	nvme_tcp_queue_request(&ctrl->async_req);
}

static enum blk_eh_timer_return
nvme_tcp_timeout(struct request *rq, bool reserved)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
	struct nvme_tcp_cmd_pdu *pdu = req->pdu;

	dev_warn(ctrl->ctrl.device,
		"queue %d: timeout request %#x type %d\n",
		nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);

	if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
		/*
		 * Teardown immediately if controller times out while starting
		 * or if we have already started error recovery. All
		 * outstanding requests are completed on shutdown, so we
		 * return BLK_EH_DONE.
		 */
		flush_work(&ctrl->err_work);
		nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
		nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
		return BLK_EH_DONE;
	}

	dev_warn(ctrl->ctrl.device, "starting error recovery\n");
	nvme_tcp_error_recovery(&ctrl->ctrl);

	return BLK_EH_RESET_TIMER;
}

static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
			struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
	struct nvme_command *c = &pdu->cmd;

	c->common.flags |= NVME_CMD_SGL_METABUF;

	if (rq_data_dir(rq) == WRITE && req->data_len &&
	    req->data_len <= nvme_tcp_inline_data_size(queue))
		nvme_tcp_set_sg_inline(queue, c, req->data_len);
	else
		nvme_tcp_set_sg_host_data(c, req->data_len);

	return 0;
}

static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
		struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
	struct nvme_tcp_queue *queue = req->queue;
	u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
	blk_status_t ret;

	ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
	if (ret)
		return ret;

	req->state = NVME_TCP_SEND_CMD_PDU;
	req->offset = 0;
	req->data_sent = 0;
	req->pdu_len = 0;
	req->pdu_sent = 0;
	req->data_len = blk_rq_payload_bytes(rq);
	req->curr_bio = rq->bio;

	if (rq_data_dir(rq) == WRITE &&
	    req->data_len <= nvme_tcp_inline_data_size(queue))
		req->pdu_len = req->data_len;
	else if (req->curr_bio)
		nvme_tcp_init_iter(req, READ);

	pdu->hdr.type = nvme_tcp_cmd;
	pdu->hdr.flags = 0;
	if (queue->hdr_digest)
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
	if (queue->data_digest && req->pdu_len) {
		pdu->hdr.flags |= NVME_TCP_F_DDGST;
		ddgst = nvme_tcp_ddgst_len(queue);
	}
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
	pdu->hdr.plen =
		cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);

	ret = nvme_tcp_map_data(queue, rq);
	if (unlikely(ret)) {
		dev_err(queue->ctrl->ctrl.device,
			"Failed to map data (%d)\n", ret);
		return ret;
	}

	return 0;
}
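
/*
 * queue_rq path: build the command PDU (and map its SGL), start the block
 * layer request, then hand the request to the per-queue send list where
 * io_work will transmit it.
 */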
static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_tcp_queue *queue = hctx->driver_data;
	struct request *rq = bd->rq;
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
	blk_status_t ret;

	if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
		return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);

	ret = nvme_tcp_setup_cmd_pdu(ns, rq);
	if (unlikely(ret))
		return ret;

	blk_mq_start_request(rq);

	nvme_tcp_queue_request(req);

	return BLK_STS_OK;
}

static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_tcp_ctrl *ctrl = set->driver_data;

	set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
	set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
	if (ctrl->ctrl.opts->nr_write_queues) {
		/* separate read/write queues */
		set->map[HCTX_TYPE_DEFAULT].nr_queues =
				ctrl->ctrl.opts->nr_write_queues;
		set->map[HCTX_TYPE_READ].queue_offset =
				ctrl->ctrl.opts->nr_write_queues;
	} else {
		/* mixed read/write queues */
		set->map[HCTX_TYPE_DEFAULT].nr_queues =
				ctrl->ctrl.opts->nr_io_queues;
		set->map[HCTX_TYPE_READ].queue_offset = 0;
	}
	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
	return 0;
}

static struct blk_mq_ops nvme_tcp_mq_ops = {
	.queue_rq	= nvme_tcp_queue_rq,
	.complete	= nvme_complete_rq,
	.init_request	= nvme_tcp_init_request,
	.exit_request	= nvme_tcp_exit_request,
	.init_hctx	= nvme_tcp_init_hctx,
	.timeout	= nvme_tcp_timeout,
	.map_queues	= nvme_tcp_map_queues,
};

static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
	.queue_rq	= nvme_tcp_queue_rq,
	.complete	= nvme_complete_rq,
	.init_request	= nvme_tcp_init_request,
	.exit_request	= nvme_tcp_exit_request,
	.init_hctx	= nvme_tcp_init_admin_hctx,
	.timeout	= nvme_tcp_timeout,
};

static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
	.name			= "tcp",
	.module			= THIS_MODULE,
	.flags			= NVME_F_FABRICS,
	.reg_read32		= nvmf_reg_read32,
	.reg_read64		= nvmf_reg_read64,
	.reg_write32		= nvmf_reg_write32,
	.free_ctrl		= nvme_tcp_free_ctrl,
	.submit_async_event	= nvme_tcp_submit_async_event,
	.delete_ctrl		= nvme_tcp_delete_ctrl,
	.get_address		= nvmf_get_address,
	.stop_ctrl		= nvme_tcp_stop_ctrl,
};

static bool
nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
{
	struct nvme_tcp_ctrl *ctrl;
	bool found = false;

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
		found = nvmf_ip_options_match(&ctrl->ctrl, opts);
		if (found)
			break;
	}
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	return found;
}

static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
		struct nvmf_ctrl_options *opts)
{
	struct nvme_tcp_ctrl *ctrl;
	int ret;

	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
	if (!ctrl)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&ctrl->list);
	ctrl->ctrl.opts = opts;
	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
	ctrl->ctrl.sqsize = opts->queue_size - 1;
	ctrl->ctrl.kato = opts->kato;

	INIT_DELAYED_WORK(&ctrl->connect_work,
			nvme_tcp_reconnect_ctrl_work);
	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);

	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
		opts->trsvcid =
			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
		if (!opts->trsvcid) {
			ret = -ENOMEM;
			goto out_free_ctrl;
		}
		opts->mask |= NVMF_OPT_TRSVCID;
	}

	ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
			opts->traddr, opts->trsvcid, &ctrl->addr);
	if (ret) {
		pr_err("malformed address passed: %s:%s\n",
			opts->traddr, opts->trsvcid);
		goto out_free_ctrl;
	}

	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
		ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
			opts->host_traddr, NULL, &ctrl->src_addr);
		if (ret) {
			pr_err("malformed src address passed: %s\n",
				opts->host_traddr);
			goto out_free_ctrl;
		}
	}

	if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
		ret = -EALREADY;
		goto out_free_ctrl;
	}

	ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
				GFP_KERNEL);
	if (!ctrl->queues) {
		ret = -ENOMEM;
		goto out_free_ctrl;
	}

	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
	if (ret)
		goto out_kfree_queues;

	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
		WARN_ON_ONCE(1);
		ret = -EINTR;
		goto out_uninit_ctrl;
	}

	ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
	if (ret)
		goto out_uninit_ctrl;

	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
		ctrl->ctrl.opts->subsysnqn, &ctrl->addr);

	nvme_get_ctrl(&ctrl->ctrl);

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	return &ctrl->ctrl;

out_uninit_ctrl:
	nvme_uninit_ctrl(&ctrl->ctrl);
	nvme_put_ctrl(&ctrl->ctrl);
	if (ret > 0)
		ret = -EIO;
	return ERR_PTR(ret);
out_kfree_queues:
	kfree(ctrl->queues);
out_free_ctrl:
	kfree(ctrl);
	return ERR_PTR(ret);
}

static struct nvmf_transport_ops nvme_tcp_transport = {
	.name		= "tcp",
	.module		= THIS_MODULE,
	.required_opts	= NVMF_OPT_TRADDR,
	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
			  NVMF_OPT_NR_WRITE_QUEUES,
	.create_ctrl	= nvme_tcp_create_ctrl,
};

static int __init nvme_tcp_init_module(void)
{
	nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
			WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	if (!nvme_tcp_wq)
		return -ENOMEM;

	nvmf_register_transport(&nvme_tcp_transport);
	return 0;
}

static void __exit nvme_tcp_cleanup_module(void)
{
	struct nvme_tcp_ctrl *ctrl;

	nvmf_unregister_transport(&nvme_tcp_transport);

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
		nvme_delete_ctrl(&ctrl->ctrl);
	mutex_unlock(&nvme_tcp_ctrl_mutex);
	flush_workqueue(nvme_delete_wq);

	destroy_workqueue(nvme_tcp_wq);
}

module_init(nvme_tcp_init_module);
module_exit(nvme_tcp_cleanup_module);

MODULE_LICENSE("GPL v2");