/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * NVMe over RDMA transport
 */
#include "spdk/stdinc.h"

#include <infiniband/verbs.h>
#include <rdma/rdma_cma.h>
#include <rdma/rdma_verbs.h>

#include "spdk/assert.h"
#include "spdk/log.h"
#include "spdk/trace.h"
#include "spdk/event.h"
#include "spdk/queue.h"
#include "spdk/nvme.h"
#include "spdk/nvmf_spec.h"
#include "spdk/string.h"
#include "spdk/endian.h"
#include "spdk/likely.h"

#include "nvme_internal.h"

#define NVME_RDMA_TIME_OUT_IN_MS 2000
#define NVME_RDMA_RW_BUFFER_SIZE 131072

/*
 * NVME RDMA qpair Resource Defaults
 */
#define NVME_RDMA_DEFAULT_TX_SGE	2
#define NVME_RDMA_DEFAULT_RX_SGE	1

/* Max number of NVMe-oF SGL descriptors supported by the host */
#define NVME_RDMA_MAX_SGL_DESCRIPTORS	16
struct spdk_nvmf_cmd {
	struct spdk_nvme_cmd cmd;
	struct spdk_nvme_sgl_descriptor sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS];
};

struct spdk_nvme_rdma_hooks g_nvme_hooks = {};
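
/*
 * Optional hooks an application can supply to provide its own protection
 * domain and memory keys; installed via spdk_nvme_rdma_init_hooks() at the
 * bottom of this file.
 */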
/* Mapping from virtual address to ibv_mr pointer for a protection domain */
struct spdk_nvme_rdma_mr_map {
	struct ibv_pd				*pd;
	struct spdk_mem_map			*map;
	uint64_t				ref;
	LIST_ENTRY(spdk_nvme_rdma_mr_map)	link;
};
/* NVMe RDMA transport extensions for spdk_nvme_ctrlr */
struct nvme_rdma_ctrlr {
	struct spdk_nvme_ctrlr			ctrlr;

	struct ibv_pd				*pd;
};
/* NVMe RDMA qpair extensions for spdk_nvme_qpair */
struct nvme_rdma_qpair {
	struct spdk_nvme_qpair			qpair;

	struct rdma_cm_id			*cm_id;

	struct ibv_cq				*cq;

	struct spdk_nvme_rdma_req		*rdma_reqs;

	uint32_t				max_send_sge;

	uint32_t				max_recv_sge;

	uint16_t				num_entries;

	/* Parallel arrays of response buffers + response SGLs of size num_entries */
	struct ibv_sge				*rsp_sgls;
	struct spdk_nvme_cpl			*rsps;

	struct ibv_recv_wr			*rsp_recv_wrs;

	/* Memory region describing all rsps for this qpair */
	struct ibv_mr				*rsp_mr;

	/*
	 * Array of num_entries NVMe commands registered as RDMA message buffers.
	 * Indexed by rdma_req->id.
	 */
	struct spdk_nvmf_cmd			*cmds;

	/* Memory region describing all cmds for this qpair */
	struct ibv_mr				*cmd_mr;

	struct spdk_nvme_rdma_mr_map		*mr_map;

	TAILQ_HEAD(, spdk_nvme_rdma_req)	free_reqs;
	TAILQ_HEAD(, spdk_nvme_rdma_req)	outstanding_reqs;

	/* Placed at the end of the struct since it is not used frequently */
	struct rdma_event_channel		*cm_channel;
};
struct spdk_nvme_rdma_req {
	uint16_t				id;

	struct ibv_send_wr			send_wr;

	struct nvme_request			*req;

	struct ibv_sge				send_sgl[NVME_RDMA_DEFAULT_TX_SGE];

	TAILQ_ENTRY(spdk_nvme_rdma_req)		link;

	bool					request_ready_to_put;
};
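
/*
 * A command is considered finished only after both its RDMA send completion
 * and the matching NVMe response (recv completion) have been observed;
 * request_ready_to_put records that the first of those two events has
 * already arrived, so whichever completion comes second frees the request.
 */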
static const char *rdma_cm_event_str[] = {
	"RDMA_CM_EVENT_ADDR_RESOLVED",
	"RDMA_CM_EVENT_ADDR_ERROR",
	"RDMA_CM_EVENT_ROUTE_RESOLVED",
	"RDMA_CM_EVENT_ROUTE_ERROR",
	"RDMA_CM_EVENT_CONNECT_REQUEST",
	"RDMA_CM_EVENT_CONNECT_RESPONSE",
	"RDMA_CM_EVENT_CONNECT_ERROR",
	"RDMA_CM_EVENT_UNREACHABLE",
	"RDMA_CM_EVENT_REJECTED",
	"RDMA_CM_EVENT_ESTABLISHED",
	"RDMA_CM_EVENT_DISCONNECTED",
	"RDMA_CM_EVENT_DEVICE_REMOVAL",
	"RDMA_CM_EVENT_MULTICAST_JOIN",
	"RDMA_CM_EVENT_MULTICAST_ERROR",
	"RDMA_CM_EVENT_ADDR_CHANGE",
	"RDMA_CM_EVENT_TIMEWAIT_EXIT"
};
static LIST_HEAD(, spdk_nvme_rdma_mr_map) g_rdma_mr_maps = LIST_HEAD_INITIALIZER(&g_rdma_mr_maps);
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

static int nvme_rdma_qpair_destroy(struct spdk_nvme_qpair *qpair);
static inline struct nvme_rdma_qpair *
nvme_rdma_qpair(struct spdk_nvme_qpair *qpair)
{
	assert(qpair->trtype == SPDK_NVME_TRANSPORT_RDMA);
	return SPDK_CONTAINEROF(qpair, struct nvme_rdma_qpair, qpair);
}
static inline struct nvme_rdma_ctrlr *
nvme_rdma_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
{
	assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA);
	return SPDK_CONTAINEROF(ctrlr, struct nvme_rdma_ctrlr, ctrlr);
}
static struct spdk_nvme_rdma_req *
nvme_rdma_req_get(struct nvme_rdma_qpair *rqpair)
{
	struct spdk_nvme_rdma_req *rdma_req;

	rdma_req = TAILQ_FIRST(&rqpair->free_reqs);
	if (rdma_req) {
		TAILQ_REMOVE(&rqpair->free_reqs, rdma_req, link);
		TAILQ_INSERT_TAIL(&rqpair->outstanding_reqs, rdma_req, link);
	}

	return rdma_req;
}
static void
nvme_rdma_req_put(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req)
{
	rdma_req->request_ready_to_put = false;
	TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link);
	TAILQ_INSERT_HEAD(&rqpair->free_reqs, rdma_req, link);
}
static void
nvme_rdma_req_complete(struct nvme_request *req,
		       struct spdk_nvme_cpl *rsp)
{
	nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp);
	nvme_free_request(req);
}
static const char *
nvme_rdma_cm_event_str_get(uint32_t event)
{
	if (event < SPDK_COUNTOF(rdma_cm_event_str)) {
		return rdma_cm_event_str[event];
	} else {
		return "Undefined";
	}
}
static struct rdma_cm_event *
nvme_rdma_get_event(struct rdma_event_channel *channel,
		    enum rdma_cm_event_type evt)
{
	struct rdma_cm_event *event;
	int rc;

	rc = rdma_get_cm_event(channel, &event);
	if (rc < 0) {
		SPDK_ERRLOG("Failed to get event from CM event channel. Error %d (%s)\n",
			    errno, spdk_strerror(errno));
		return NULL;
	}

	if (event->event != evt) {
		SPDK_ERRLOG("Expected %s but received %s (%d) from CM event channel (status = %d)\n",
			    nvme_rdma_cm_event_str_get(evt),
			    nvme_rdma_cm_event_str_get(event->event), event->event, event->status);
		rdma_ack_cm_event(event);
		return NULL;
	}

	return event;
}
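
/*
 * Create the completion queue and reliable-connected queue pair for this
 * qpair. The CQ is sized at num_entries * 2 because every command consumes
 * two completions: one for the send and one for the recv.
 */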
static int
nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair)
{
	int rc;
	struct ibv_qp_init_attr attr;
	struct ibv_device_attr dev_attr;
	struct nvme_rdma_ctrlr *rctrlr;

	rc = ibv_query_device(rqpair->cm_id->verbs, &dev_attr);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
		return -1;
	}

	rqpair->cq = ibv_create_cq(rqpair->cm_id->verbs, rqpair->num_entries * 2, rqpair, NULL, 0);
	if (!rqpair->cq) {
		SPDK_ERRLOG("Unable to create completion queue: errno %d: %s\n", errno, spdk_strerror(errno));
		return -1;
	}

	rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr);
	if (g_nvme_hooks.get_ibv_pd) {
		rctrlr->pd = g_nvme_hooks.get_ibv_pd(&rctrlr->ctrlr.trid, rqpair->cm_id->verbs);
	} else {
		rctrlr->pd = NULL;
	}

	memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
	attr.qp_type		= IBV_QPT_RC;
	attr.send_cq		= rqpair->cq;
	attr.recv_cq		= rqpair->cq;
	attr.cap.max_send_wr	= rqpair->num_entries; /* SEND operations */
	attr.cap.max_recv_wr	= rqpair->num_entries; /* RECV operations */
	attr.cap.max_send_sge	= spdk_min(NVME_RDMA_DEFAULT_TX_SGE, dev_attr.max_sge);
	attr.cap.max_recv_sge	= spdk_min(NVME_RDMA_DEFAULT_RX_SGE, dev_attr.max_sge);

	rc = rdma_create_qp(rqpair->cm_id, rctrlr->pd, &attr);
	if (rc != 0) {
		SPDK_ERRLOG("rdma_create_qp failed\n");
		return -1;
	}

	/* ibv_create_qp will change the values in attr.cap. Make sure we store the proper value. */
	rqpair->max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, attr.cap.max_send_sge);
	rqpair->max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, attr.cap.max_recv_sge);

	rctrlr->pd = rqpair->cm_id->qp->pd;

	rqpair->cm_id->context = &rqpair->qpair;

	return 0;
}
#define nvme_rdma_trace_ibv_sge(sg_list) \
	if (sg_list) { \
		SPDK_DEBUGLOG(SPDK_LOG_NVME, "local addr %p length 0x%x lkey 0x%x\n", \
			      (void *)(sg_list)->addr, (sg_list)->length, (sg_list)->lkey); \
	}
static int
nvme_rdma_post_recv(struct nvme_rdma_qpair *rqpair, uint16_t rsp_idx)
{
	struct ibv_recv_wr *wr, *bad_wr = NULL;
	int rc;

	wr = &rqpair->rsp_recv_wrs[rsp_idx];
	nvme_rdma_trace_ibv_sge(wr->sg_list);

	rc = ibv_post_recv(rqpair->cm_id->qp, wr, &bad_wr);
	if (rc) {
		SPDK_ERRLOG("Failure posting rdma recv, rc = 0x%x\n", rc);
	}

	return rc;
}
static void
nvme_rdma_unregister_rsps(struct nvme_rdma_qpair *rqpair)
{
	if (rqpair->rsp_mr && rdma_dereg_mr(rqpair->rsp_mr)) {
		SPDK_ERRLOG("Unable to de-register rsp_mr\n");
	}
	rqpair->rsp_mr = NULL;
}
static void
nvme_rdma_free_rsps(struct nvme_rdma_qpair *rqpair)
{
	free(rqpair->rsps);
	rqpair->rsps = NULL;
	free(rqpair->rsp_sgls);
	rqpair->rsp_sgls = NULL;
	free(rqpair->rsp_recv_wrs);
	rqpair->rsp_recv_wrs = NULL;
}
static int
nvme_rdma_alloc_rsps(struct nvme_rdma_qpair *rqpair)
{
	rqpair->rsps = NULL;
	rqpair->rsp_recv_wrs = NULL;

	rqpair->rsp_sgls = calloc(rqpair->num_entries, sizeof(*rqpair->rsp_sgls));
	if (!rqpair->rsp_sgls) {
		SPDK_ERRLOG("Failed to allocate rsp_sgls\n");
		goto fail;
	}

	rqpair->rsp_recv_wrs = calloc(rqpair->num_entries,
				      sizeof(*rqpair->rsp_recv_wrs));
	if (!rqpair->rsp_recv_wrs) {
		SPDK_ERRLOG("Failed to allocate rsp_recv_wrs\n");
		goto fail;
	}

	rqpair->rsps = calloc(rqpair->num_entries, sizeof(*rqpair->rsps));
	if (!rqpair->rsps) {
		SPDK_ERRLOG("cannot allocate rdma rsps\n");
		goto fail;
	}

	return 0;

fail:
	nvme_rdma_free_rsps(rqpair);
	return -ENOMEM;
}
static int
nvme_rdma_register_rsps(struct nvme_rdma_qpair *rqpair)
{
	uint16_t i;

	rqpair->rsp_mr = rdma_reg_msgs(rqpair->cm_id, rqpair->rsps,
				       rqpair->num_entries * sizeof(*rqpair->rsps));
	if (rqpair->rsp_mr == NULL) {
		SPDK_ERRLOG("Unable to register rsp_mr\n");
		goto fail;
	}

	for (i = 0; i < rqpair->num_entries; i++) {
		struct ibv_sge *rsp_sgl = &rqpair->rsp_sgls[i];

		rsp_sgl->addr = (uint64_t)&rqpair->rsps[i];
		rsp_sgl->length = sizeof(rqpair->rsps[i]);
		rsp_sgl->lkey = rqpair->rsp_mr->lkey;

		rqpair->rsp_recv_wrs[i].wr_id = i;
		rqpair->rsp_recv_wrs[i].next = NULL;
		rqpair->rsp_recv_wrs[i].sg_list = rsp_sgl;
		rqpair->rsp_recv_wrs[i].num_sge = 1;

		if (nvme_rdma_post_recv(rqpair, i)) {
			SPDK_ERRLOG("Unable to post connection rx desc\n");
			goto fail;
		}
	}

	return 0;

fail:
	nvme_rdma_unregister_rsps(rqpair);
	return -ENOMEM;
}
static void
nvme_rdma_unregister_reqs(struct nvme_rdma_qpair *rqpair)
{
	if (rqpair->cmd_mr && rdma_dereg_mr(rqpair->cmd_mr)) {
		SPDK_ERRLOG("Unable to de-register cmd_mr\n");
	}
	rqpair->cmd_mr = NULL;
}
static void
nvme_rdma_free_reqs(struct nvme_rdma_qpair *rqpair)
{
	if (!rqpair->rdma_reqs) {
		return;
	}

	free(rqpair->cmds);
	rqpair->cmds = NULL;

	free(rqpair->rdma_reqs);
	rqpair->rdma_reqs = NULL;
}
static int
nvme_rdma_alloc_reqs(struct nvme_rdma_qpair *rqpair)
{
	rqpair->rdma_reqs = calloc(rqpair->num_entries, sizeof(struct spdk_nvme_rdma_req));
	if (rqpair->rdma_reqs == NULL) {
		SPDK_ERRLOG("Failed to allocate rdma_reqs\n");
		goto fail;
	}

	rqpair->cmds = calloc(rqpair->num_entries, sizeof(*rqpair->cmds));
	if (!rqpair->cmds) {
		SPDK_ERRLOG("Failed to allocate RDMA cmds\n");
		goto fail;
	}

	return 0;

fail:
	nvme_rdma_free_reqs(rqpair);
	return -ENOMEM;
}
static int
nvme_rdma_register_reqs(struct nvme_rdma_qpair *rqpair)
{
	int i;

	rqpair->cmd_mr = rdma_reg_msgs(rqpair->cm_id, rqpair->cmds,
				       rqpair->num_entries * sizeof(*rqpair->cmds));
	if (!rqpair->cmd_mr) {
		SPDK_ERRLOG("Unable to register cmd_mr\n");
		goto fail;
	}

	TAILQ_INIT(&rqpair->free_reqs);
	TAILQ_INIT(&rqpair->outstanding_reqs);
	for (i = 0; i < rqpair->num_entries; i++) {
		struct spdk_nvme_rdma_req	*rdma_req;
		struct spdk_nvmf_cmd		*cmd;

		rdma_req = &rqpair->rdma_reqs[i];
		cmd = &rqpair->cmds[i];

		rdma_req->id = i;

		/* The first RDMA sgl element will always point
		 * at this data structure. Depending on whether
		 * an NVMe-oF SGL is required, the length of
		 * this element may change. */
		rdma_req->send_sgl[0].addr = (uint64_t)cmd;
		rdma_req->send_sgl[0].lkey = rqpair->cmd_mr->lkey;

		rdma_req->send_wr.wr_id = (uint64_t)rdma_req;
		rdma_req->send_wr.next = NULL;
		rdma_req->send_wr.opcode = IBV_WR_SEND;
		rdma_req->send_wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->send_wr.sg_list = rdma_req->send_sgl;
		rdma_req->send_wr.imm_data = 0;

		TAILQ_INSERT_TAIL(&rqpair->free_reqs, rdma_req, link);
	}

	return 0;

fail:
	nvme_rdma_unregister_reqs(rqpair);
	return -ENOMEM;
}
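
/*
 * Process a received NVMe completion: complete the matching request,
 * re-post the recv work request, and resubmit one queued request if any
 * are waiting for a free rdma_req.
 */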
static int
nvme_rdma_recv(struct nvme_rdma_qpair *rqpair, uint64_t rsp_idx)
{
	struct spdk_nvme_qpair *qpair = &rqpair->qpair;
	struct spdk_nvme_rdma_req *rdma_req;
	struct spdk_nvme_cpl *rsp;
	struct nvme_request *req;

	assert(rsp_idx < rqpair->num_entries);
	rsp = &rqpair->rsps[rsp_idx];
	rdma_req = &rqpair->rdma_reqs[rsp->cid];

	req = rdma_req->req;
	nvme_rdma_req_complete(req, rsp);

	if (rdma_req->request_ready_to_put) {
		nvme_rdma_req_put(rqpair, rdma_req);
	} else {
		rdma_req->request_ready_to_put = true;
	}

	if (nvme_rdma_post_recv(rqpair, rsp_idx)) {
		SPDK_ERRLOG("Unable to re-post rx descriptor\n");
		return -1;
	}

	if (!STAILQ_EMPTY(&qpair->queued_req) && !qpair->ctrlr->is_resetting) {
		req = STAILQ_FIRST(&qpair->queued_req);
		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
		nvme_qpair_submit_request(qpair, req);
	}

	return 0;
}
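
/*
 * Resolve the destination address and then the route through the RDMA CM,
 * blocking on the event channel until each step completes.
 */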
static int
nvme_rdma_resolve_addr(struct nvme_rdma_qpair *rqpair,
		       struct sockaddr *src_addr,
		       struct sockaddr *dst_addr,
		       struct rdma_event_channel *cm_channel)
{
	int ret;
	struct rdma_cm_event *event;

	ret = rdma_resolve_addr(rqpair->cm_id, src_addr, dst_addr,
				NVME_RDMA_TIME_OUT_IN_MS);
	if (ret) {
		SPDK_ERRLOG("rdma_resolve_addr, %d\n", errno);
		return ret;
	}

	event = nvme_rdma_get_event(cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED);
	if (event == NULL) {
		SPDK_ERRLOG("RDMA address resolution error\n");
		return -1;
	}
	rdma_ack_cm_event(event);

	ret = rdma_resolve_route(rqpair->cm_id, NVME_RDMA_TIME_OUT_IN_MS);
	if (ret) {
		SPDK_ERRLOG("rdma_resolve_route\n");
		return ret;
	}

	event = nvme_rdma_get_event(cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED);
	if (event == NULL) {
		SPDK_ERRLOG("RDMA route resolution error\n");
		return -1;
	}
	rdma_ack_cm_event(event);

	return 0;
}
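
/*
 * Perform the RDMA CM connect handshake, exchanging NVMe-oF private data
 * with the target. Note that hrqsize is the actual host receive queue depth,
 * while hsqsize is a 0-based value per the NVMe-oF spec, hence the
 * num_entries - 1 below.
 */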
static int
nvme_rdma_connect(struct nvme_rdma_qpair *rqpair)
{
	struct rdma_conn_param param = {};
	struct spdk_nvmf_rdma_request_private_data request_data = {};
	struct spdk_nvmf_rdma_accept_private_data *accept_data;
	struct ibv_device_attr attr;
	int ret;
	struct rdma_cm_event *event;
	struct spdk_nvme_ctrlr *ctrlr;

	ret = ibv_query_device(rqpair->cm_id->verbs, &attr);
	if (ret != 0) {
		SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
		return ret;
	}

	param.responder_resources = spdk_min(rqpair->num_entries, attr.max_qp_rd_atom);

	ctrlr = rqpair->qpair.ctrlr;
	if (!ctrlr) {
		return -1;
	}

	request_data.qid = rqpair->qpair.id;
	request_data.hrqsize = rqpair->num_entries;
	request_data.hsqsize = rqpair->num_entries - 1;
	request_data.cntlid = ctrlr->cntlid;

	param.private_data = &request_data;
	param.private_data_len = sizeof(request_data);
	param.retry_count = 7;
	param.rnr_retry_count = 7;

	ret = rdma_connect(rqpair->cm_id, &param);
	if (ret) {
		SPDK_ERRLOG("nvme rdma connect error\n");
		return ret;
	}

	event = nvme_rdma_get_event(rqpair->cm_channel, RDMA_CM_EVENT_ESTABLISHED);
	if (event == NULL) {
		SPDK_ERRLOG("RDMA connect error\n");
		return -1;
	}

	accept_data = (struct spdk_nvmf_rdma_accept_private_data *)event->param.conn.private_data;
	if (accept_data == NULL) {
		rdma_ack_cm_event(event);
		SPDK_ERRLOG("NVMe-oF target did not return accept data\n");
		return -1;
	}

	SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested queue depth %d. Actually got queue depth %d.\n",
		      rqpair->num_entries, accept_data->crqsize);

	rqpair->num_entries = spdk_min(rqpair->num_entries, accept_data->crqsize);

	rdma_ack_cm_event(event);

	return 0;
}
static int
nvme_rdma_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service)
{
	struct addrinfo *res;
	struct addrinfo hints;
	int ret;

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = family;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_protocol = 0;

	ret = getaddrinfo(addr, service, &hints, &res);
	if (ret) {
		SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret);
		return ret;
	}

	if (res->ai_addrlen > sizeof(*sa)) {
		SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen);
		ret = EINVAL;
	} else {
		memcpy(sa, res->ai_addr, res->ai_addrlen);
	}

	freeaddrinfo(res);
	return ret;
}
static int
nvme_rdma_mr_map_notify(void *cb_ctx, struct spdk_mem_map *map,
			enum spdk_mem_map_notify_action action,
			void *vaddr, size_t size)
{
	struct ibv_pd *pd = cb_ctx;
	struct ibv_mr *mr;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (!g_nvme_hooks.get_rkey) {
			mr = ibv_reg_mr(pd, vaddr, size,
					IBV_ACCESS_LOCAL_WRITE |
					IBV_ACCESS_REMOTE_READ |
					IBV_ACCESS_REMOTE_WRITE);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -EFAULT;
			}
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
		} else {
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  g_nvme_hooks.get_rkey(pd, vaddr, size));
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (!g_nvme_hooks.get_rkey) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}
static int
nvme_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	/* Two contiguous mappings will point to the same address which is the start of the RDMA MR. */
	return addr_1 == addr_2;
}
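
/*
 * Memory registration maps are created once per protection domain and shared
 * by every qpair using that domain; they are reference counted under
 * g_rdma_mr_maps_mutex.
 */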
static int
nvme_rdma_register_mem(struct nvme_rdma_qpair *rqpair)
{
	struct ibv_pd *pd = rqpair->cm_id->qp->pd;
	struct spdk_nvme_rdma_mr_map *mr_map;
	const struct spdk_mem_map_ops nvme_rdma_map_ops = {
		.notify_cb = nvme_rdma_mr_map_notify,
		.are_contiguous = nvme_rdma_check_contiguous_entries
	};

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);

	/* Look up existing mem map registration for this pd */
	LIST_FOREACH(mr_map, &g_rdma_mr_maps, link) {
		if (mr_map->pd == pd) {
			mr_map->ref++;
			rqpair->mr_map = mr_map;
			pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
			return 0;
		}
	}

	mr_map = calloc(1, sizeof(*mr_map));
	if (mr_map == NULL) {
		SPDK_ERRLOG("calloc() failed\n");
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return -1;
	}

	mr_map->ref = 1;
	mr_map->pd = pd;
	mr_map->map = spdk_mem_map_alloc((uint64_t)NULL, &nvme_rdma_map_ops, pd);
	if (mr_map->map == NULL) {
		SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
		free(mr_map);
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return -1;
	}

	rqpair->mr_map = mr_map;
	LIST_INSERT_HEAD(&g_rdma_mr_maps, mr_map, link);

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);

	return 0;
}
static void
nvme_rdma_unregister_mem(struct nvme_rdma_qpair *rqpair)
{
	struct spdk_nvme_rdma_mr_map *mr_map;

	mr_map = rqpair->mr_map;
	rqpair->mr_map = NULL;

	if (mr_map == NULL) {
		return;
	}

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);

	assert(mr_map->ref > 0);
	mr_map->ref--;
	if (mr_map->ref == 0) {
		LIST_REMOVE(mr_map, link);
		spdk_mem_map_free(&mr_map->map);
		free(mr_map);
	}

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
}
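
/*
 * Full connect sequence for a qpair: create the CM event channel and id,
 * resolve the address and route, create the QP, run the RDMA CM connect
 * handshake, register command/response buffers and the memory map, and
 * finally send the NVMe-oF Fabrics CONNECT command.
 */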
static int
nvme_rdma_qpair_connect(struct nvme_rdma_qpair *rqpair)
{
	struct sockaddr_storage dst_addr;
	struct sockaddr_storage src_addr;
	bool src_addr_specified;
	int rc;
	struct spdk_nvme_ctrlr *ctrlr;
	int family;

	rqpair->cm_channel = rdma_create_event_channel();
	if (rqpair->cm_channel == NULL) {
		SPDK_ERRLOG("rdma_create_event_channel() failed\n");
		return -1;
	}

	ctrlr = rqpair->qpair.ctrlr;

	switch (ctrlr->trid.adrfam) {
	case SPDK_NVMF_ADRFAM_IPV4:
		family = AF_INET;
		break;
	case SPDK_NVMF_ADRFAM_IPV6:
		family = AF_INET6;
		break;
	default:
		SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam);
		return -1;
	}

	SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family);

	memset(&dst_addr, 0, sizeof(dst_addr));

	SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid);
	rc = nvme_rdma_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid);
	if (rc != 0) {
		SPDK_ERRLOG("dst_addr nvme_rdma_parse_addr() failed\n");
		return -1;
	}

	if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) {
		memset(&src_addr, 0, sizeof(src_addr));
		rc = nvme_rdma_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid);
		if (rc != 0) {
			SPDK_ERRLOG("src_addr nvme_rdma_parse_addr() failed\n");
			return -1;
		}
		src_addr_specified = true;
	} else {
		src_addr_specified = false;
	}

	rc = rdma_create_id(rqpair->cm_channel, &rqpair->cm_id, rqpair, RDMA_PS_TCP);
	if (rc < 0) {
		SPDK_ERRLOG("rdma_create_id() failed\n");
		return -1;
	}

	rc = nvme_rdma_resolve_addr(rqpair,
				    src_addr_specified ? (struct sockaddr *)&src_addr : NULL,
				    (struct sockaddr *)&dst_addr, rqpair->cm_channel);
	if (rc < 0) {
		SPDK_ERRLOG("nvme_rdma_resolve_addr() failed\n");
		return -1;
	}

	rc = nvme_rdma_qpair_init(rqpair);
	if (rc < 0) {
		SPDK_ERRLOG("nvme_rdma_qpair_init() failed\n");
		return -1;
	}

	rc = nvme_rdma_connect(rqpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect the rqpair\n");
		return -1;
	}

	rc = nvme_rdma_register_reqs(rqpair);
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
	if (rc) {
		SPDK_ERRLOG("Unable to register rqpair RDMA requests\n");
		return -1;
	}
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests registered\n");

	rc = nvme_rdma_register_rsps(rqpair);
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
	if (rc < 0) {
		SPDK_ERRLOG("Unable to register rqpair RDMA responses\n");
		return -1;
	}
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses registered\n");

	rc = nvme_rdma_register_mem(rqpair);
	if (rc < 0) {
		SPDK_ERRLOG("Unable to register memory for RDMA\n");
		return -1;
	}

	rc = nvme_fabric_qpair_connect(&rqpair->qpair, rqpair->num_entries);
	if (rc < 0) {
		SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n");
		return -1;
	}

	return 0;
}
/*
 * Build SGL describing empty payload.
 */
static int
nvme_rdma_build_null_request(struct spdk_nvme_rdma_req *rdma_req)
{
	struct nvme_request *req = rdma_req->req;

	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;

	/* The first element of this SGL is pointing at an
	 * spdk_nvmf_cmd object. For this particular command,
	 * we only need the first 64 bytes corresponding to
	 * the NVMe command. */
	rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);

	/* The RDMA SGL needs one element describing the NVMe command. */
	rdma_req->send_wr.num_sge = 1;

	req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
	req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
	req->cmd.dptr.sgl1.keyed.length = 0;
	req->cmd.dptr.sgl1.keyed.key = 0;
	req->cmd.dptr.sgl1.address = 0;

	return 0;
}
/*
 * Build inline SGL describing contiguous payload buffer.
 */
static int
nvme_rdma_build_contig_inline_request(struct nvme_rdma_qpair *rqpair,
				      struct spdk_nvme_rdma_req *rdma_req)
{
	struct nvme_request *req = rdma_req->req;
	struct ibv_mr *mr;
	void *payload;
	uint64_t requested_size;

	payload = req->payload.contig_or_cb_arg + req->payload_offset;
	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);

	requested_size = req->payload_size;

	if (!g_nvme_hooks.get_rkey) {
		mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map,
				(uint64_t)payload, &requested_size);

		if (mr == NULL || requested_size < req->payload_size) {
			SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n");
			return -EINVAL;
		}
		rdma_req->send_sgl[1].lkey = mr->lkey;
	} else {
		rdma_req->send_sgl[1].lkey = spdk_mem_map_translate(rqpair->mr_map->map,
					     (uint64_t)payload, &requested_size);
	}

	/* The first element of this SGL is pointing at an
	 * spdk_nvmf_cmd object. For this particular command,
	 * we only need the first 64 bytes corresponding to
	 * the NVMe command. */
	rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);

	rdma_req->send_sgl[1].addr = (uint64_t)payload;
	rdma_req->send_sgl[1].length = (uint32_t)req->payload_size;

	/* The RDMA SGL contains two elements. The first describes
	 * the NVMe command and the second describes the data
	 * payload. */
	rdma_req->send_wr.num_sge = 2;

	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
	req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
	req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size;
	/* Inline only supported for icdoff == 0 currently. This function will
	 * not get called for controllers with other values. */
	req->cmd.dptr.sgl1.address = (uint64_t)0;

	return 0;
}
/*
 * Build SGL describing contiguous payload buffer.
 */
static int
nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair,
			       struct spdk_nvme_rdma_req *rdma_req)
{
	struct nvme_request *req = rdma_req->req;
	void *payload = req->payload.contig_or_cb_arg + req->payload_offset;
	struct ibv_mr *mr;
	uint64_t requested_size;

	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);

	requested_size = req->payload_size;
	if (!g_nvme_hooks.get_rkey) {
		mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)payload,
				&requested_size);
		if (mr == NULL) {
			return -1;
		}
		req->cmd.dptr.sgl1.keyed.key = mr->rkey;
	} else {
		req->cmd.dptr.sgl1.keyed.key = spdk_mem_map_translate(rqpair->mr_map->map,
					       (uint64_t)payload, &requested_size);
	}

	if (requested_size < req->payload_size) {
		SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n");
		return -1;
	}

	/* The first element of this SGL is pointing at an
	 * spdk_nvmf_cmd object. For this particular command,
	 * we only need the first 64 bytes corresponding to
	 * the NVMe command. */
	rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);

	/* The RDMA SGL needs one element describing the NVMe command. */
	rdma_req->send_wr.num_sge = 1;

	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
	req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
	req->cmd.dptr.sgl1.keyed.length = req->payload_size;
	req->cmd.dptr.sgl1.address = (uint64_t)payload;

	return 0;
}
/*
 * Build SGL describing scattered payload buffer.
 */
static int
nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair,
			    struct spdk_nvme_rdma_req *rdma_req)
{
	struct nvme_request *req = rdma_req->req;
	struct spdk_nvmf_cmd *cmd = &rqpair->cmds[rdma_req->id];
	struct ibv_mr *mr = NULL;
	void *virt_addr;
	uint64_t remaining_size, mr_length;
	uint32_t sge_length;
	int rc, max_num_sgl, num_sgl_desc;

	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	assert(req->payload.next_sge_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	max_num_sgl = req->qpair->ctrlr->max_sges;

	remaining_size = req->payload_size;
	num_sgl_desc = 0;
	do {
		rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &sge_length);
		if (rc) {
			return -1;
		}

		sge_length = spdk_min(remaining_size, sge_length);
		mr_length = sge_length;

		if (!g_nvme_hooks.get_rkey) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map,
					(uint64_t)virt_addr,
					&mr_length);
			if (mr == NULL) {
				return -1;
			}
			cmd->sgl[num_sgl_desc].keyed.key = mr->rkey;
		} else {
			cmd->sgl[num_sgl_desc].keyed.key = spdk_mem_map_translate(rqpair->mr_map->map,
							   (uint64_t)virt_addr,
							   &mr_length);
		}

		if (mr_length < sge_length) {
			SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n");
			return -1;
		}

		cmd->sgl[num_sgl_desc].keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
		cmd->sgl[num_sgl_desc].keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
		cmd->sgl[num_sgl_desc].keyed.length = sge_length;
		cmd->sgl[num_sgl_desc].address = (uint64_t)virt_addr;

		remaining_size -= sge_length;
		num_sgl_desc++;
	} while (remaining_size > 0 && num_sgl_desc < max_num_sgl);

	/* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */
	if (remaining_size > 0) {
		return -1;
	}

	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;

	/* The RDMA SGL needs one element describing some portion
	 * of the spdk_nvmf_cmd structure. */
	rdma_req->send_wr.num_sge = 1;

	/*
	 * If only one SGL descriptor is required, it can be embedded directly in the command
	 * as a data block descriptor.
	 */
	if (num_sgl_desc == 1) {
		/* The first element of this SGL is pointing at an
		 * spdk_nvmf_cmd object. For this particular command,
		 * we only need the first 64 bytes corresponding to
		 * the NVMe command. */
		rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);

		req->cmd.dptr.sgl1.keyed.type = cmd->sgl[0].keyed.type;
		req->cmd.dptr.sgl1.keyed.subtype = cmd->sgl[0].keyed.subtype;
		req->cmd.dptr.sgl1.keyed.length = cmd->sgl[0].keyed.length;
		req->cmd.dptr.sgl1.keyed.key = cmd->sgl[0].keyed.key;
		req->cmd.dptr.sgl1.address = cmd->sgl[0].address;
	} else {
		/*
		 * Otherwise, the SGL descriptor embedded in the command must point to the list of
		 * SGL descriptors used to describe the operation. In that case it is a last segment descriptor.
		 */
		rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd) + sizeof(struct
					       spdk_nvme_sgl_descriptor) * num_sgl_desc;

		req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
		req->cmd.dptr.sgl1.unkeyed.length = num_sgl_desc * sizeof(struct spdk_nvme_sgl_descriptor);
		req->cmd.dptr.sgl1.address = (uint64_t)0;
	}

	return 0;
}
/*
 * Build inline SGL describing sgl payload buffer.
 */
static int
nvme_rdma_build_sgl_inline_request(struct nvme_rdma_qpair *rqpair,
				   struct spdk_nvme_rdma_req *rdma_req)
{
	struct nvme_request *req = rdma_req->req;
	struct ibv_mr *mr;
	uint32_t length;
	uint64_t requested_size;
	void *virt_addr;
	int rc, i;

	assert(req->payload_size != 0);
	assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
	assert(req->payload.reset_sgl_fn != NULL);
	assert(req->payload.next_sge_fn != NULL);
	req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);

	rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
	if (rc) {
		return -1;
	}

	if (length < req->payload_size) {
		SPDK_DEBUGLOG(SPDK_LOG_NVME, "Inline SGL request split so sending separately.\n");
		return nvme_rdma_build_sgl_request(rqpair, rdma_req);
	}

	if (length > req->payload_size) {
		length = req->payload_size;
	}

	requested_size = length;
	mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)virt_addr,
			&requested_size);
	if (mr == NULL || requested_size < length) {
		for (i = 1; i < rdma_req->send_wr.num_sge; i++) {
			rdma_req->send_sgl[i].addr = 0;
			rdma_req->send_sgl[i].length = 0;
			rdma_req->send_sgl[i].lkey = 0;
		}

		SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n");
		return -1;
	}

	rdma_req->send_sgl[1].addr = (uint64_t)virt_addr;
	rdma_req->send_sgl[1].length = length;
	rdma_req->send_sgl[1].lkey = mr->lkey;

	rdma_req->send_wr.num_sge = 2;

	/* The first element of this SGL is pointing at an
	 * spdk_nvmf_cmd object. For this particular command,
	 * we only need the first 64 bytes corresponding to
	 * the NVMe command. */
	rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);

	req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
	req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
	req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
	req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size;
	/* Inline only supported for icdoff == 0 currently. This function will
	 * not get called for controllers with other values. */
	req->cmd.dptr.sgl1.address = (uint64_t)0;

	return 0;
}
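
/*
 * ioccsz is reported by the controller in 16-byte units and covers the
 * 64-byte command capsule itself, so the room available for in-capsule
 * data is ioccsz * 16 - sizeof(struct spdk_nvme_cmd).
 */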
static inline unsigned int
nvme_rdma_icdsz_bytes(struct spdk_nvme_ctrlr *ctrlr)
{
	return (ctrlr->cdata.nvmf_specific.ioccsz * 16 - sizeof(struct spdk_nvme_cmd));
}
static int
nvme_rdma_req_init(struct nvme_rdma_qpair *rqpair, struct nvme_request *req,
		   struct spdk_nvme_rdma_req *rdma_req)
{
	struct spdk_nvme_ctrlr *ctrlr = rqpair->qpair.ctrlr;
	int rc;

	rdma_req->req = req;
	req->cmd.cid = rdma_req->id;

	if (req->payload_size == 0) {
		rc = nvme_rdma_build_null_request(rdma_req);
	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) {
		/*
		 * Check if icdoff is non zero, to avoid interop conflicts with
		 * targets with non-zero icdoff. Both SPDK and the Linux kernel
		 * targets use icdoff = 0. For targets with non-zero icdoff, we
		 * will currently just not use inline data for now.
		 */
		if (req->cmd.opc == SPDK_NVME_OPC_WRITE &&
		    req->payload_size <= nvme_rdma_icdsz_bytes(ctrlr) &&
		    (ctrlr->cdata.nvmf_specific.icdoff == 0)) {
			rc = nvme_rdma_build_contig_inline_request(rqpair, rdma_req);
		} else {
			rc = nvme_rdma_build_contig_request(rqpair, rdma_req);
		}
	} else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) {
		if (req->cmd.opc == SPDK_NVME_OPC_WRITE &&
		    req->payload_size <= nvme_rdma_icdsz_bytes(ctrlr) &&
		    ctrlr->cdata.nvmf_specific.icdoff == 0) {
			rc = nvme_rdma_build_sgl_inline_request(rqpair, rdma_req);
		} else {
			rc = nvme_rdma_build_sgl_request(rqpair, rdma_req);
		}
	} else {
		rc = -1;
	}

	if (rc) {
		return rc;
	}

	memcpy(&rqpair->cmds[rdma_req->id], &req->cmd, sizeof(req->cmd));
	return 0;
}
static struct spdk_nvme_qpair *
nvme_rdma_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr,
			     uint16_t qid, uint32_t qsize,
			     enum spdk_nvme_qprio qprio,
			     uint32_t num_requests)
{
	struct nvme_rdma_qpair *rqpair;
	struct spdk_nvme_qpair *qpair;
	int rc;

	rqpair = calloc(1, sizeof(struct nvme_rdma_qpair));
	if (!rqpair) {
		SPDK_ERRLOG("failed to allocate rqpair\n");
		return NULL;
	}

	rqpair->num_entries = qsize;

	qpair = &rqpair->qpair;

	rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests);
	if (rc != 0) {
		return NULL;
	}

	rc = nvme_rdma_alloc_reqs(rqpair);
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
	if (rc) {
		SPDK_ERRLOG("Unable to allocate rqpair RDMA requests\n");
		return NULL;
	}
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests allocated\n");

	rc = nvme_rdma_alloc_rsps(rqpair);
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
	if (rc < 0) {
		SPDK_ERRLOG("Unable to allocate rqpair RDMA responses\n");
		return NULL;
	}
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses allocated\n");

	rc = nvme_rdma_qpair_connect(rqpair);
	if (rc < 0) {
		nvme_rdma_qpair_destroy(qpair);
		return NULL;
	}

	return qpair;
}
static void
nvme_rdma_qpair_disconnect(struct spdk_nvme_qpair *qpair)
{
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);

	nvme_rdma_unregister_mem(rqpair);
	nvme_rdma_unregister_reqs(rqpair);
	nvme_rdma_unregister_rsps(rqpair);

	if (rqpair->cm_id) {
		if (rqpair->cm_id->qp) {
			rdma_destroy_qp(rqpair->cm_id);
		}
		rdma_destroy_id(rqpair->cm_id);
		rqpair->cm_id = NULL;
	}

	if (rqpair->cq) {
		ibv_destroy_cq(rqpair->cq);
		rqpair->cq = NULL;
	}

	if (rqpair->cm_channel) {
		rdma_destroy_event_channel(rqpair->cm_channel);
		rqpair->cm_channel = NULL;
	}
}
static int
nvme_rdma_qpair_destroy(struct spdk_nvme_qpair *qpair)
{
	struct nvme_rdma_qpair *rqpair;

	if (!qpair) {
		return -1;
	}

	nvme_rdma_qpair_disconnect(qpair);
	nvme_rdma_qpair_abort_reqs(qpair, 1);
	nvme_qpair_deinit(qpair);

	rqpair = nvme_rdma_qpair(qpair);

	nvme_rdma_free_reqs(rqpair);
	nvme_rdma_free_rsps(rqpair);
	free(rqpair);

	return 0;
}
struct spdk_nvme_qpair *
nvme_rdma_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
				const struct spdk_nvme_io_qpair_opts *opts)
{
	return nvme_rdma_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio,
					    opts->io_queue_requests);
}

int
nvme_rdma_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
{
	/* do nothing here */
	return 0;
}
/* This function must only be called while holding g_spdk_nvme_driver->lock */
int
nvme_rdma_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx,
		     bool direct_connect)
{
	struct spdk_nvme_ctrlr_opts discovery_opts;
	struct spdk_nvme_ctrlr *discovery_ctrlr;
	union spdk_nvme_cc_register cc;
	int rc;
	struct nvme_completion_poll_status status;

	if (strcmp(probe_ctx->trid.subnqn, SPDK_NVMF_DISCOVERY_NQN) != 0) {
		/* Not a discovery controller; try to connect to it directly. */
		rc = nvme_ctrlr_probe(&probe_ctx->trid, probe_ctx, NULL);
		return rc;
	}

	spdk_nvme_ctrlr_get_default_ctrlr_opts(&discovery_opts, sizeof(discovery_opts));
	/* For discovery_ctrlr set the timeout to 0 */
	discovery_opts.keep_alive_timeout_ms = 0;

	discovery_ctrlr = nvme_rdma_ctrlr_construct(&probe_ctx->trid, &discovery_opts, NULL);
	if (discovery_ctrlr == NULL) {
		return -1;
	}

	/* TODO: this should be using the normal NVMe controller initialization process */
	cc.raw = 0;
	cc.bits.en = 1;
	cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
	cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */
	rc = nvme_transport_ctrlr_set_reg_4(discovery_ctrlr, offsetof(struct spdk_nvme_registers, cc.raw),
					    cc.raw);
	if (rc < 0) {
		SPDK_ERRLOG("Failed to set cc\n");
		nvme_ctrlr_destruct(discovery_ctrlr);
		return -1;
	}

	/* Direct attach through spdk_nvme_connect() API */
	if (direct_connect == true) {
		/* get the cdata info */
		rc = nvme_ctrlr_cmd_identify(discovery_ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0,
					     &discovery_ctrlr->cdata, sizeof(discovery_ctrlr->cdata),
					     nvme_completion_poll_cb, &status);
		if (rc != 0) {
			SPDK_ERRLOG("Failed to identify cdata\n");
			return rc;
		}

		if (spdk_nvme_wait_for_completion(discovery_ctrlr->adminq, &status)) {
			SPDK_ERRLOG("nvme_identify_controller failed!\n");
			return -1;
		}

		/* Set the ready state to skip the normal init process */
		discovery_ctrlr->state = NVME_CTRLR_STATE_READY;
		nvme_ctrlr_connected(probe_ctx, discovery_ctrlr);
		nvme_ctrlr_add_process(discovery_ctrlr, 0);
		return 0;
	}

	rc = nvme_fabric_ctrlr_discover(discovery_ctrlr, probe_ctx);
	nvme_ctrlr_destruct(discovery_ctrlr);
	return rc;
}
struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
		const struct spdk_nvme_ctrlr_opts *opts,
		void *devhandle)
{
	struct nvme_rdma_ctrlr *rctrlr;
	union spdk_nvme_cap_register cap;
	union spdk_nvme_vs_register vs;
	int rc;

	rctrlr = calloc(1, sizeof(struct nvme_rdma_ctrlr));
	if (rctrlr == NULL) {
		SPDK_ERRLOG("could not allocate ctrlr\n");
		return NULL;
	}

	rctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_RDMA;
	rctrlr->ctrlr.opts = *opts;
	memcpy(&rctrlr->ctrlr.trid, trid, sizeof(rctrlr->ctrlr.trid));

	rc = nvme_ctrlr_construct(&rctrlr->ctrlr);
	if (rc != 0) {
		free(rctrlr);
		return NULL;
	}

	rctrlr->ctrlr.adminq = nvme_rdma_ctrlr_create_qpair(&rctrlr->ctrlr, 0,
			       SPDK_NVMF_MIN_ADMIN_QUEUE_ENTRIES, 0, SPDK_NVMF_MIN_ADMIN_QUEUE_ENTRIES);
	if (!rctrlr->ctrlr.adminq) {
		SPDK_ERRLOG("failed to create admin qpair\n");
		nvme_rdma_ctrlr_destruct(&rctrlr->ctrlr);
		return NULL;
	}

	if (nvme_ctrlr_get_cap(&rctrlr->ctrlr, &cap)) {
		SPDK_ERRLOG("get_cap() failed\n");
		nvme_ctrlr_destruct(&rctrlr->ctrlr);
		return NULL;
	}

	if (nvme_ctrlr_get_vs(&rctrlr->ctrlr, &vs)) {
		SPDK_ERRLOG("get_vs() failed\n");
		nvme_ctrlr_destruct(&rctrlr->ctrlr);
		return NULL;
	}

	if (nvme_ctrlr_add_process(&rctrlr->ctrlr, 0) != 0) {
		SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n");
		nvme_ctrlr_destruct(&rctrlr->ctrlr);
		return NULL;
	}

	nvme_ctrlr_init_cap(&rctrlr->ctrlr, &cap, &vs);

	SPDK_DEBUGLOG(SPDK_LOG_NVME, "successfully initialized the nvmf ctrlr\n");
	return &rctrlr->ctrlr;
}
int
nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr);

	if (ctrlr->adminq) {
		nvme_rdma_qpair_destroy(ctrlr->adminq);
	}

	nvme_ctrlr_destruct_finish(ctrlr);

	free(rctrlr);

	return 0;
}
int
nvme_rdma_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
{
	return nvme_fabric_ctrlr_set_reg_4(ctrlr, offset, value);
}

int
nvme_rdma_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
{
	return nvme_fabric_ctrlr_set_reg_8(ctrlr, offset, value);
}

int
nvme_rdma_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
{
	return nvme_fabric_ctrlr_get_reg_4(ctrlr, offset, value);
}

int
nvme_rdma_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
{
	return nvme_fabric_ctrlr_get_reg_8(ctrlr, offset, value);
}
int
nvme_rdma_qpair_submit_request(struct spdk_nvme_qpair *qpair,
			       struct nvme_request *req)
{
	struct nvme_rdma_qpair *rqpair;
	struct spdk_nvme_rdma_req *rdma_req;
	struct ibv_send_wr *wr, *bad_wr = NULL;
	int rc;

	rqpair = nvme_rdma_qpair(qpair);
	assert(rqpair != NULL);
	assert(req != NULL);

	rdma_req = nvme_rdma_req_get(rqpair);
	if (!rdma_req) {
		/*
		 * No rdma_req is available, so queue the request to be
		 * processed later.
		 */
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		return 0;
	}

	if (nvme_rdma_req_init(rqpair, req, rdma_req)) {
		SPDK_ERRLOG("nvme_rdma_req_init() failed\n");
		nvme_rdma_req_put(rqpair, rdma_req);
		return -1;
	}

	wr = &rdma_req->send_wr;

	nvme_rdma_trace_ibv_sge(wr->sg_list);

	rc = ibv_post_send(rqpair->cm_id->qp, wr, &bad_wr);
	if (rc) {
		SPDK_ERRLOG("Failure posting rdma send for NVMf completion: %d (%s)\n", rc, spdk_strerror(rc));
	}

	return rc;
}
int
nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	return nvme_rdma_qpair_destroy(qpair);
}

int
nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	return nvme_rdma_qpair_connect(nvme_rdma_qpair(qpair));
}

void
nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	nvme_rdma_qpair_disconnect(qpair);
}

int
nvme_rdma_qpair_reset(struct spdk_nvme_qpair *qpair)
{
	/* Currently, doing nothing here */
	return 0;
}
void
nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr)
{
	struct spdk_nvme_rdma_req *rdma_req, *tmp;
	struct nvme_request *req;
	struct spdk_nvme_cpl cpl;
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);

	cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
	cpl.status.sct = SPDK_NVME_SCT_GENERIC;
	cpl.status.dnr = dnr;

	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
		assert(rdma_req->req != NULL);
		req = rdma_req->req;

		nvme_rdma_req_complete(req, &cpl);
		nvme_rdma_req_put(rqpair, rdma_req);
	}
}
static void
nvme_rdma_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
{
	uint64_t t02;
	struct spdk_nvme_rdma_req *rdma_req, *tmp;
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct spdk_nvme_ctrlr_process *active_proc;

	/* Don't check timeouts during controller initialization. */
	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
		return;
	}

	if (nvme_qpair_is_admin_queue(qpair)) {
		active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
	} else {
		active_proc = qpair->active_proc;
	}

	/* Only check timeouts if the current process has a timeout callback. */
	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
		return;
	}

	t02 = spdk_get_ticks();
	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
		assert(rdma_req->req != NULL);

		if (nvme_request_check_timeout(rdma_req->req, rdma_req->id, active_proc, t02)) {
			/*
			 * The requests are in order, so as soon as one has not timed out,
			 * stop iterating.
			 */
			break;
		}
	}
}
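
/*
 * Poll the completion queue. Every command generates two completions: an
 * IBV_WC_SEND completion for the command capsule (wr_id holds the
 * spdk_nvme_rdma_req pointer) and an IBV_WC_RECV completion carrying the
 * NVMe response (wr_id holds the response index).
 */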
#define MAX_COMPLETIONS_PER_POLL 128

int
nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair,
				    uint32_t max_completions)
{
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
	struct ibv_wc wc[MAX_COMPLETIONS_PER_POLL];
	int i, rc, batch_size;
	uint32_t reaped;
	struct ibv_cq *cq;
	struct spdk_nvme_rdma_req *rdma_req;

	if (max_completions == 0) {
		max_completions = rqpair->num_entries;
	} else {
		max_completions = spdk_min(max_completions, rqpair->num_entries);
	}

	cq = rqpair->cq;

	reaped = 0;
	do {
		batch_size = spdk_min((max_completions - reaped),
				      MAX_COMPLETIONS_PER_POLL);
		rc = ibv_poll_cq(cq, batch_size, wc);
		if (rc < 0) {
			SPDK_ERRLOG("Error polling CQ! (%d): %s\n",
				    errno, spdk_strerror(errno));
			return -1;
		} else if (rc == 0) {
			/* Ran out of completions */
			break;
		}

		for (i = 0; i < rc; i++) {
			if (wc[i].status) {
				SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n",
					    qpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status));
				return -1;
			}

			switch (wc[i].opcode) {
			case IBV_WC_RECV:
				SPDK_DEBUGLOG(SPDK_LOG_NVME, "CQ recv completion\n");

				reaped++;

				if (wc[i].byte_len < sizeof(struct spdk_nvme_cpl)) {
					SPDK_ERRLOG("recv length %u less than expected response size\n", wc[i].byte_len);
					return -1;
				}

				if (nvme_rdma_recv(rqpair, wc[i].wr_id)) {
					SPDK_ERRLOG("nvme_rdma_recv processing failure\n");
					return -1;
				}
				break;

			case IBV_WC_SEND:
				rdma_req = (struct spdk_nvme_rdma_req *)wc[i].wr_id;

				if (rdma_req->request_ready_to_put) {
					nvme_rdma_req_put(rqpair, rdma_req);
				} else {
					rdma_req->request_ready_to_put = true;
				}
				break;

			default:
				SPDK_ERRLOG("Received an unexpected opcode on the CQ: %d\n", wc[i].opcode);
				return -1;
			}
		}
	} while (reaped < max_completions);

	if (spdk_unlikely(rqpair->qpair.ctrlr->timeout_enabled)) {
		nvme_rdma_qpair_check_timeout(qpair);
	}

	return reaped;
}
uint32_t
nvme_rdma_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
{
	/* TODO: this should be obtained from the NVMe-oF target */
	return NVME_RDMA_RW_BUFFER_SIZE;
}

uint16_t
nvme_rdma_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
{
	return spdk_min(ctrlr->cdata.nvmf_specific.msdbd, NVME_RDMA_MAX_SGL_DESCRIPTORS);
}

volatile struct spdk_nvme_registers *
nvme_rdma_ctrlr_get_registers(struct spdk_nvme_ctrlr *ctrlr)
{
	return NULL;
}

void *
nvme_rdma_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size)
{
	return NULL;
}

int
nvme_rdma_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size)
{
	return 0;
}
void
nvme_rdma_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
{
	struct spdk_nvme_rdma_req *rdma_req, *tmp;
	struct nvme_request *req;
	struct spdk_nvme_cpl cpl;
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);

	cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION;
	cpl.status.sct = SPDK_NVME_SCT_GENERIC;

	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
		if (rdma_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
			continue;
		}
		assert(rdma_req->req != NULL);
		req = rdma_req->req;

		nvme_rdma_req_complete(req, &cpl);
		nvme_rdma_req_put(rqpair, rdma_req);
	}
}
void
spdk_nvme_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks)
{
	g_nvme_hooks = *hooks;
}