]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | /*- |
2 | * BSD LICENSE | |
3 | * | |
4 | * Copyright (c) Intel Corporation. | |
5 | * All rights reserved. | |
6 | * | |
7 | * Redistribution and use in source and binary forms, with or without | |
8 | * modification, are permitted provided that the following conditions | |
9 | * are met: | |
10 | * | |
11 | * * Redistributions of source code must retain the above copyright | |
12 | * notice, this list of conditions and the following disclaimer. | |
13 | * * Redistributions in binary form must reproduce the above copyright | |
14 | * notice, this list of conditions and the following disclaimer in | |
15 | * the documentation and/or other materials provided with the | |
16 | * distribution. | |
17 | * * Neither the name of Intel Corporation nor the names of its | |
18 | * contributors may be used to endorse or promote products derived | |
19 | * from this software without specific prior written permission. | |
20 | * | |
21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
24 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
25 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
26 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
27 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
28 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
29 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
30 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
31 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
32 | */ | |
33 | ||
34 | /* | |
35 | * NVMe over RDMA transport | |
36 | */ | |
37 | ||
11fdf7f2 TL |
38 | #include "spdk/stdinc.h" |
39 | ||
7c673cae FG |
40 | #include <infiniband/verbs.h> |
41 | #include <rdma/rdma_cma.h> | |
42 | #include <rdma/rdma_verbs.h> | |
7c673cae FG |
43 | |
44 | #include "spdk/assert.h" | |
45 | #include "spdk/log.h" | |
46 | #include "spdk/trace.h" | |
47 | #include "spdk/event.h" | |
48 | #include "spdk/queue.h" | |
49 | #include "spdk/nvme.h" | |
50 | #include "spdk/nvmf_spec.h" | |
51 | #include "spdk/string.h" | |
11fdf7f2 TL |
52 | #include "spdk/endian.h" |
53 | #include "spdk/likely.h" | |
7c673cae FG |
54 | |
55 | #include "nvme_internal.h" | |
56 | ||
57 | #define NVME_RDMA_TIME_OUT_IN_MS 2000 | |
58 | #define NVME_RDMA_RW_BUFFER_SIZE 131072 | |
7c673cae FG |
59 | |
60 | /* | |
11fdf7f2 | 61 | * NVME RDMA qpair Resource Defaults |
7c673cae FG |
62 | */ |
63 | #define NVME_RDMA_DEFAULT_TX_SGE 2 | |
64 | #define NVME_RDMA_DEFAULT_RX_SGE 1 | |
65 | ||
11fdf7f2 TL |
66 | |
/* Max number of NVMe-oF SGL descriptors supported by the host */
#define NVME_RDMA_MAX_SGL_DESCRIPTORS	16

/*
 * In-capsule command layout: the 64-byte NVMe command followed by room for
 * a list of NVMe-oF SGL descriptors.  Each entry in nvme_rdma_qpair->cmds
 * is one of these; only the first sizeof(struct spdk_nvme_cmd) bytes are
 * sent when no descriptor list is needed.
 */
struct spdk_nvmf_cmd {
	struct spdk_nvme_cmd cmd;
	struct spdk_nvme_sgl_descriptor sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS];
};
73 | ||
9f95a23c TL |
74 | struct spdk_nvme_rdma_hooks g_nvme_hooks = {}; |
75 | ||
11fdf7f2 TL |
/* Mapping from virtual address to ibv_mr pointer for a protection domain.
 * Reference-counted and shared between all qpairs that use the same PD
 * (see g_rdma_mr_maps). */
struct spdk_nvme_rdma_mr_map {
	struct ibv_pd				*pd;	/* protection domain this map registers memory for */
	struct spdk_mem_map			*map;	/* vaddr -> ibv_mr (or rkey when hooks are used) */
	uint64_t				ref;	/* number of qpairs sharing this map */
	LIST_ENTRY(spdk_nvme_rdma_mr_map)	link;
};
83 | ||
7c673cae FG |
/* NVMe RDMA transport extensions for spdk_nvme_ctrlr */
struct nvme_rdma_ctrlr {
	struct spdk_nvme_ctrlr	ctrlr;

	/* Protection domain shared by this controller's qpairs; either supplied
	 * by g_nvme_hooks.get_ibv_pd or taken from the QP after creation. */
	struct ibv_pd		*pd;
};
90 | ||
/* NVMe RDMA qpair extensions for spdk_nvme_qpair */
struct nvme_rdma_qpair {
	struct spdk_nvme_qpair		qpair;

	struct rdma_cm_id		*cm_id;

	/* Single CQ used for both send and recv completions of this qpair. */
	struct ibv_cq			*cq;

	struct spdk_nvme_rdma_req	*rdma_reqs;

	/* SGE limits actually granted by the device at QP creation time. */
	uint32_t			max_send_sge;

	uint32_t			max_recv_sge;

	uint16_t			num_entries;

	/* Parallel arrays of response buffers + response SGLs of size num_entries */
	struct ibv_sge			*rsp_sgls;
	struct spdk_nvme_cpl		*rsps;

	struct ibv_recv_wr		*rsp_recv_wrs;

	/* Memory region describing all rsps for this qpair */
	struct ibv_mr			*rsp_mr;

	/*
	 * Array of num_entries NVMe commands registered as RDMA message buffers.
	 * Indexed by rdma_req->id.
	 */
	struct spdk_nvmf_cmd		*cmds;

	/* Memory region describing all cmds for this qpair */
	struct ibv_mr			*cmd_mr;

	/* Shared per-PD registration map used to translate payload buffers. */
	struct spdk_nvme_rdma_mr_map	*mr_map;

	TAILQ_HEAD(, spdk_nvme_rdma_req)	free_reqs;
	TAILQ_HEAD(, spdk_nvme_rdma_req)	outstanding_reqs;

	/* Placed at the end of the struct since it is not used frequently */
	struct rdma_event_channel	*cm_channel;
};
133 | ||
/* Per-command tracker pairing an nvme_request with its RDMA send resources. */
struct spdk_nvme_rdma_req {
	/* Index into rqpair->rdma_reqs / rqpair->cmds; also used as the CID. */
	int				id;

	struct ibv_send_wr		send_wr;

	struct nvme_request		*req;

	/* [0] covers the spdk_nvmf_cmd capsule; [1] optionally carries
	 * in-capsule data. */
	struct ibv_sge			send_sgl[NVME_RDMA_DEFAULT_TX_SGE];

	TAILQ_ENTRY(spdk_nvme_rdma_req)	link;

	/* Set when one of the two completions (send WC, NVMe cpl) has been seen;
	 * the request is recycled only after both have arrived. */
	bool				request_ready_to_put;
};
7c673cae | 147 | |
11fdf7f2 TL |
/* Printable names indexed by enum rdma_cm_event_type, for diagnostics. */
static const char *rdma_cm_event_str[] = {
	"RDMA_CM_EVENT_ADDR_RESOLVED",
	"RDMA_CM_EVENT_ADDR_ERROR",
	"RDMA_CM_EVENT_ROUTE_RESOLVED",
	"RDMA_CM_EVENT_ROUTE_ERROR",
	"RDMA_CM_EVENT_CONNECT_REQUEST",
	"RDMA_CM_EVENT_CONNECT_RESPONSE",
	"RDMA_CM_EVENT_CONNECT_ERROR",
	"RDMA_CM_EVENT_UNREACHABLE",
	"RDMA_CM_EVENT_REJECTED",
	"RDMA_CM_EVENT_ESTABLISHED",
	"RDMA_CM_EVENT_DISCONNECTED",
	"RDMA_CM_EVENT_DEVICE_REMOVAL",
	"RDMA_CM_EVENT_MULTICAST_JOIN",
	"RDMA_CM_EVENT_MULTICAST_ERROR",
	"RDMA_CM_EVENT_ADDR_CHANGE",
	"RDMA_CM_EVENT_TIMEWAIT_EXIT"
};
166 | ||
11fdf7f2 TL |
/* Global registry of per-PD memory-region maps, shared across qpairs. */
static LIST_HEAD(, spdk_nvme_rdma_mr_map) g_rdma_mr_maps = LIST_HEAD_INITIALIZER(&g_rdma_mr_maps);
/* Serializes all access to g_rdma_mr_maps and to each map's refcount. */
static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;

static int nvme_rdma_qpair_destroy(struct spdk_nvme_qpair *qpair);
171 | ||
/* Downcast a generic qpair to its RDMA transport wrapper. */
static inline struct nvme_rdma_qpair *
nvme_rdma_qpair(struct spdk_nvme_qpair *qpair)
{
	assert(qpair->trtype == SPDK_NVME_TRANSPORT_RDMA);
	return SPDK_CONTAINEROF(qpair, struct nvme_rdma_qpair, qpair);
}
178 | ||
/* Downcast a generic controller to its RDMA transport wrapper. */
static inline struct nvme_rdma_ctrlr *
nvme_rdma_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
{
	assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA);
	return SPDK_CONTAINEROF(ctrlr, struct nvme_rdma_ctrlr, ctrlr);
}
185 | ||
186 | static struct spdk_nvme_rdma_req * | |
187 | nvme_rdma_req_get(struct nvme_rdma_qpair *rqpair) | |
188 | { | |
189 | struct spdk_nvme_rdma_req *rdma_req; | |
190 | ||
11fdf7f2 | 191 | rdma_req = TAILQ_FIRST(&rqpair->free_reqs); |
7c673cae | 192 | if (rdma_req) { |
11fdf7f2 TL |
193 | TAILQ_REMOVE(&rqpair->free_reqs, rdma_req, link); |
194 | TAILQ_INSERT_TAIL(&rqpair->outstanding_reqs, rdma_req, link); | |
7c673cae FG |
195 | } |
196 | ||
197 | return rdma_req; | |
198 | } | |
199 | ||
/* Return a completed tracker to the free list and clear its handshake flag
 * so the next user starts with a clean two-phase completion state. */
static void
nvme_rdma_req_put(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req)
{
	rdma_req->request_ready_to_put = false;
	TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link);
	TAILQ_INSERT_HEAD(&rqpair->free_reqs, rdma_req, link);
}
207 | ||
/* Deliver the completion to the upper layer's callback, then release the
 * nvme_request back to its pool. */
static void
nvme_rdma_req_complete(struct nvme_request *req,
		       struct spdk_nvme_cpl *rsp)
{
	nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp);
	nvme_free_request(req);
}
215 | ||
11fdf7f2 TL |
216 | static const char * |
217 | nvme_rdma_cm_event_str_get(uint32_t event) | |
218 | { | |
219 | if (event < SPDK_COUNTOF(rdma_cm_event_str)) { | |
220 | return rdma_cm_event_str[event]; | |
221 | } else { | |
222 | return "Undefined"; | |
223 | } | |
224 | } | |
225 | ||
7c673cae FG |
/*
 * Block on the CM event channel until an event arrives and verify it is the
 * expected type.  On success the caller owns the returned event and must
 * rdma_ack_cm_event() it; on mismatch the event is acked here and NULL is
 * returned.
 */
static struct rdma_cm_event *
nvme_rdma_get_event(struct rdma_event_channel *channel,
		    enum rdma_cm_event_type evt)
{
	struct rdma_cm_event	*event;
	int			rc;

	rc = rdma_get_cm_event(channel, &event);
	if (rc < 0) {
		SPDK_ERRLOG("Failed to get event from CM event channel. Error %d (%s)\n",
			    errno, spdk_strerror(errno));
		return NULL;
	}

	if (event->event != evt) {
		SPDK_ERRLOG("Expected %s but received %s (%d) from CM event channel (status = %d)\n",
			    nvme_rdma_cm_event_str_get(evt),
			    nvme_rdma_cm_event_str_get(event->event), event->event, event->status);
		/* Unexpected event: ack it here so the channel does not stall. */
		rdma_ack_cm_event(event);
		return NULL;
	}

	return event;
}
250 | ||
251 | static int | |
252 | nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair) | |
253 | { | |
254 | int rc; | |
255 | struct ibv_qp_init_attr attr; | |
9f95a23c TL |
256 | struct ibv_device_attr dev_attr; |
257 | struct nvme_rdma_ctrlr *rctrlr; | |
258 | ||
259 | rc = ibv_query_device(rqpair->cm_id->verbs, &dev_attr); | |
260 | if (rc != 0) { | |
261 | SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); | |
262 | return -1; | |
263 | } | |
7c673cae FG |
264 | |
265 | rqpair->cq = ibv_create_cq(rqpair->cm_id->verbs, rqpair->num_entries * 2, rqpair, NULL, 0); | |
266 | if (!rqpair->cq) { | |
11fdf7f2 | 267 | SPDK_ERRLOG("Unable to create completion queue: errno %d: %s\n", errno, spdk_strerror(errno)); |
7c673cae FG |
268 | return -1; |
269 | } | |
270 | ||
9f95a23c TL |
271 | rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr); |
272 | if (g_nvme_hooks.get_ibv_pd) { | |
273 | rctrlr->pd = g_nvme_hooks.get_ibv_pd(&rctrlr->ctrlr.trid, rqpair->cm_id->verbs); | |
274 | } else { | |
275 | rctrlr->pd = NULL; | |
276 | } | |
277 | ||
7c673cae FG |
278 | memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); |
279 | attr.qp_type = IBV_QPT_RC; | |
280 | attr.send_cq = rqpair->cq; | |
281 | attr.recv_cq = rqpair->cq; | |
282 | attr.cap.max_send_wr = rqpair->num_entries; /* SEND operations */ | |
283 | attr.cap.max_recv_wr = rqpair->num_entries; /* RECV operations */ | |
9f95a23c TL |
284 | attr.cap.max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, dev_attr.max_sge); |
285 | attr.cap.max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, dev_attr.max_sge); | |
286 | ||
287 | rc = rdma_create_qp(rqpair->cm_id, rctrlr->pd, &attr); | |
7c673cae | 288 | |
7c673cae FG |
289 | if (rc) { |
290 | SPDK_ERRLOG("rdma_create_qp failed\n"); | |
291 | return -1; | |
292 | } | |
293 | ||
9f95a23c TL |
294 | /* ibv_create_qp will change the values in attr.cap. Make sure we store the proper value. */ |
295 | rqpair->max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, attr.cap.max_send_sge); | |
296 | rqpair->max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, attr.cap.max_recv_sge); | |
297 | ||
298 | rctrlr->pd = rqpair->cm_id->qp->pd; | |
299 | ||
7c673cae FG |
300 | rqpair->cm_id->context = &rqpair->qpair; |
301 | ||
302 | return 0; | |
303 | } | |
304 | ||
/* Debug-log the (addr, length, lkey) of an ibv_sge, skipping NULL lists.
 * Wrapped in do { } while (0) so the macro behaves as a single statement:
 * the original bare "if" form is unsafe in unbraced if/else contexts
 * (dangling else) and left a stray empty statement at call sites. */
#define nvme_rdma_trace_ibv_sge(sg_list) \
	do { \
		if (sg_list) { \
			SPDK_DEBUGLOG(SPDK_LOG_NVME, "local addr %p length 0x%x lkey 0x%x\n", \
				      (void *)(sg_list)->addr, (sg_list)->length, (sg_list)->lkey); \
		} \
	} while (0)
310 | ||
/* (Re)post the receive work request for response slot rsp_idx so the target
 * can deliver the next NVMe completion into it.  Returns the ibv_post_recv
 * result (0 on success). */
static int
nvme_rdma_post_recv(struct nvme_rdma_qpair *rqpair, uint16_t rsp_idx)
{
	struct ibv_recv_wr *wr, *bad_wr = NULL;
	int rc;

	wr = &rqpair->rsp_recv_wrs[rsp_idx];
	nvme_rdma_trace_ibv_sge(wr->sg_list);

	rc = ibv_post_recv(rqpair->cm_id->qp, wr, &bad_wr);
	if (rc) {
		SPDK_ERRLOG("Failure posting rdma recv, rc = 0x%x\n", rc);
	}

	return rc;
}
327 | ||
/* De-register the MR covering the response array.  Safe to call when the MR
 * was never registered (rsp_mr == NULL). */
static void
nvme_rdma_unregister_rsps(struct nvme_rdma_qpair *rqpair)
{
	if (rqpair->rsp_mr && rdma_dereg_mr(rqpair->rsp_mr)) {
		SPDK_ERRLOG("Unable to de-register rsp_mr\n");
	}
	rqpair->rsp_mr = NULL;
}
7c673cae | 336 | |
9f95a23c TL |
/* Free the response buffers, SGLs, and recv WRs allocated by
 * nvme_rdma_alloc_rsps.  Idempotent: pointers are NULLed after freeing. */
static void
nvme_rdma_free_rsps(struct nvme_rdma_qpair *rqpair)
{
	free(rqpair->rsps);
	rqpair->rsps = NULL;
	free(rqpair->rsp_sgls);
	rqpair->rsp_sgls = NULL;
	free(rqpair->rsp_recv_wrs);
	rqpair->rsp_recv_wrs = NULL;
}
347 | ||
/*
 * Allocate the three parallel per-slot arrays (response SGLs, recv WRs and
 * response buffers), each of rqpair->num_entries elements.  Returns 0 on
 * success or -ENOMEM, freeing any partial allocations on failure.
 */
static int
nvme_rdma_alloc_rsps(struct nvme_rdma_qpair *rqpair)
{
	rqpair->rsps = NULL;
	rqpair->rsp_recv_wrs = NULL;

	rqpair->rsp_sgls = calloc(rqpair->num_entries, sizeof(*rqpair->rsp_sgls));
	if (!rqpair->rsp_sgls) {
		SPDK_ERRLOG("Failed to allocate rsp_sgls\n");
		goto fail;
	}

	rqpair->rsp_recv_wrs = calloc(rqpair->num_entries,
				      sizeof(*rqpair->rsp_recv_wrs));
	if (!rqpair->rsp_recv_wrs) {
		SPDK_ERRLOG("Failed to allocate rsp_recv_wrs\n");
		goto fail;
	}

	rqpair->rsps = calloc(rqpair->num_entries, sizeof(*rqpair->rsps));
	if (!rqpair->rsps) {
		SPDK_ERRLOG("can not allocate rdma rsps\n");
		goto fail;
	}

	return 0;
fail:
	nvme_rdma_free_rsps(rqpair);
	return -ENOMEM;
}
378 | ||
/*
 * Register the response array as one MR, wire up each slot's SGL/recv WR
 * (wr_id doubles as the slot index), and post every recv so the qpair can
 * accept completions.  Returns 0 on success or -ENOMEM after de-registering
 * the MR on any failure.
 */
static int
nvme_rdma_register_rsps(struct nvme_rdma_qpair *rqpair)
{
	int i;

	rqpair->rsp_mr = rdma_reg_msgs(rqpair->cm_id, rqpair->rsps,
				       rqpair->num_entries * sizeof(*rqpair->rsps));
	if (rqpair->rsp_mr == NULL) {
		SPDK_ERRLOG("Unable to register rsp_mr\n");
		goto fail;
	}

	for (i = 0; i < rqpair->num_entries; i++) {
		struct ibv_sge *rsp_sgl = &rqpair->rsp_sgls[i];

		rsp_sgl->addr = (uint64_t)&rqpair->rsps[i];
		rsp_sgl->length = sizeof(rqpair->rsps[i]);
		rsp_sgl->lkey = rqpair->rsp_mr->lkey;

		/* wr_id is the slot index, recovered on completion. */
		rqpair->rsp_recv_wrs[i].wr_id = i;
		rqpair->rsp_recv_wrs[i].next = NULL;
		rqpair->rsp_recv_wrs[i].sg_list = rsp_sgl;
		rqpair->rsp_recv_wrs[i].num_sge = 1;

		if (nvme_rdma_post_recv(rqpair, i)) {
			SPDK_ERRLOG("Unable to post connection rx desc\n");
			goto fail;
		}
	}

	return 0;

fail:
	nvme_rdma_unregister_rsps(rqpair);
	return -ENOMEM;
}
415 | ||
/* De-register the MR covering the command array.  Safe to call when the MR
 * was never registered (cmd_mr == NULL). */
static void
nvme_rdma_unregister_reqs(struct nvme_rdma_qpair *rqpair)
{
	if (rqpair->cmd_mr && rdma_dereg_mr(rqpair->cmd_mr)) {
		SPDK_ERRLOG("Unable to de-register cmd_mr\n");
	}
	rqpair->cmd_mr = NULL;
}
424 | ||
/* Free the request trackers and command capsules allocated by
 * nvme_rdma_alloc_reqs.  No-op if the trackers were never allocated. */
static void
nvme_rdma_free_reqs(struct nvme_rdma_qpair *rqpair)
{
	if (!rqpair->rdma_reqs) {
		return;
	}

	free(rqpair->cmds);
	rqpair->cmds = NULL;

	free(rqpair->rdma_reqs);
	rqpair->rdma_reqs = NULL;
}
438 | ||
/*
 * Allocate the parallel arrays of request trackers and command capsules
 * (num_entries each).  Returns 0 on success or -ENOMEM, freeing any partial
 * allocations on failure.
 */
static int
nvme_rdma_alloc_reqs(struct nvme_rdma_qpair *rqpair)
{
	rqpair->rdma_reqs = calloc(rqpair->num_entries, sizeof(struct spdk_nvme_rdma_req));
	if (rqpair->rdma_reqs == NULL) {
		SPDK_ERRLOG("Failed to allocate rdma_reqs\n");
		goto fail;
	}

	rqpair->cmds = calloc(rqpair->num_entries, sizeof(*rqpair->cmds));
	if (!rqpair->cmds) {
		SPDK_ERRLOG("Failed to allocate RDMA cmds\n");
		goto fail;
	}

	return 0;
fail:
	nvme_rdma_free_reqs(rqpair);
	return -ENOMEM;
}
459 | ||
/*
 * Register the command-capsule array as one MR and initialize every request
 * tracker: bind tracker i to capsule i, pre-fill the constant parts of its
 * send WR/SGL, and place it on the free list.  Returns 0 on success or
 * -ENOMEM after de-registering the MR on failure.
 */
static int
nvme_rdma_register_reqs(struct nvme_rdma_qpair *rqpair)
{
	int i;

	rqpair->cmd_mr = rdma_reg_msgs(rqpair->cm_id, rqpair->cmds,
				       rqpair->num_entries * sizeof(*rqpair->cmds));
	if (!rqpair->cmd_mr) {
		SPDK_ERRLOG("Unable to register cmd_mr\n");
		goto fail;
	}

	TAILQ_INIT(&rqpair->free_reqs);
	TAILQ_INIT(&rqpair->outstanding_reqs);
	for (i = 0; i < rqpair->num_entries; i++) {
		struct spdk_nvme_rdma_req	*rdma_req;
		struct spdk_nvmf_cmd		*cmd;

		rdma_req = &rqpair->rdma_reqs[i];
		cmd = &rqpair->cmds[i];

		rdma_req->id = i;

		/* The first RDMA sgl element will always point
		 * at this data structure. Depending on whether
		 * an NVMe-oF SGL is required, the length of
		 * this element may change. */
		rdma_req->send_sgl[0].addr = (uint64_t)cmd;
		rdma_req->send_sgl[0].lkey = rqpair->cmd_mr->lkey;

		/* wr_id carries the tracker pointer, recovered on send completion. */
		rdma_req->send_wr.wr_id = (uint64_t)rdma_req;
		rdma_req->send_wr.next = NULL;
		rdma_req->send_wr.opcode = IBV_WR_SEND;
		rdma_req->send_wr.send_flags = IBV_SEND_SIGNALED;
		rdma_req->send_wr.sg_list = rdma_req->send_sgl;
		rdma_req->send_wr.imm_data = 0;

		TAILQ_INSERT_TAIL(&rqpair->free_reqs, rdma_req, link);
	}

	return 0;

fail:
	nvme_rdma_unregister_reqs(rqpair);
	return -ENOMEM;
}
506 | ||
/*
 * Handle a receive completion for response slot rsp_idx: complete the
 * matching request (looked up by CID), advance the two-phase recycle
 * handshake, re-post the recv buffer, and resubmit one queued request if
 * any are waiting.  Returns 0 on success, -1 if the recv re-post fails.
 */
static int
nvme_rdma_recv(struct nvme_rdma_qpair *rqpair, uint64_t rsp_idx)
{
	struct spdk_nvme_qpair *qpair = &rqpair->qpair;
	struct spdk_nvme_rdma_req *rdma_req;
	struct spdk_nvme_cpl *rsp;
	struct nvme_request *req;

	assert(rsp_idx < rqpair->num_entries);
	rsp = &rqpair->rsps[rsp_idx];
	/* The completion's CID equals rdma_req->id, so it indexes the tracker array. */
	rdma_req = &rqpair->rdma_reqs[rsp->cid];

	req = rdma_req->req;
	nvme_rdma_req_complete(req, rsp);

	/* Recycle the tracker only after both the send completion and the NVMe
	 * response have been observed, whichever arrives second. */
	if (rdma_req->request_ready_to_put) {
		nvme_rdma_req_put(rqpair, rdma_req);
	} else {
		rdma_req->request_ready_to_put = true;
	}

	if (nvme_rdma_post_recv(rqpair, rsp_idx)) {
		SPDK_ERRLOG("Unable to re-post rx descriptor\n");
		return -1;
	}

	/* A slot just freed up: kick one request that was queued for lack of
	 * resources, unless the controller is mid-reset. */
	if (!STAILQ_EMPTY(&qpair->queued_req) && !qpair->ctrlr->is_resetting) {
		req = STAILQ_FIRST(&qpair->queued_req);
		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
		nvme_qpair_submit_request(qpair, req);
	}

	return 0;
}
541 | ||
/*
 * Synchronously resolve the destination address and then the route for the
 * qpair's cm_id, waiting on cm_channel for the ADDR_RESOLVED and
 * ROUTE_RESOLVED events.  src_addr may be NULL to let the kernel choose the
 * source.  Returns 0 on success, non-zero on failure.
 */
static int
nvme_rdma_resolve_addr(struct nvme_rdma_qpair *rqpair,
		       struct sockaddr *src_addr,
		       struct sockaddr *dst_addr,
		       struct rdma_event_channel *cm_channel)
{
	int ret;
	struct rdma_cm_event *event;

	ret = rdma_resolve_addr(rqpair->cm_id, src_addr, dst_addr,
				NVME_RDMA_TIME_OUT_IN_MS);
	if (ret) {
		SPDK_ERRLOG("rdma_resolve_addr, %d\n", errno);
		return ret;
	}

	event = nvme_rdma_get_event(cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED);
	if (event == NULL) {
		SPDK_ERRLOG("RDMA address resolution error\n");
		return -1;
	}
	rdma_ack_cm_event(event);

	ret = rdma_resolve_route(rqpair->cm_id, NVME_RDMA_TIME_OUT_IN_MS);
	if (ret) {
		SPDK_ERRLOG("rdma_resolve_route\n");
		return ret;
	}

	event = nvme_rdma_get_event(cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED);
	if (event == NULL) {
		SPDK_ERRLOG("RDMA route resolution error\n");
		return -1;
	}
	rdma_ack_cm_event(event);

	return 0;
}
580 | ||
/*
 * Issue the RDMA CM connect carrying the NVMe-oF private data (qid, queue
 * sizes, cntlid), wait for ESTABLISHED, and clamp num_entries to the queue
 * depth the target granted in its accept private data.  Returns 0 on
 * success, non-zero on failure.
 */
static int
nvme_rdma_connect(struct nvme_rdma_qpair *rqpair)
{
	struct rdma_conn_param				param = {};
	struct spdk_nvmf_rdma_request_private_data	request_data = {};
	struct spdk_nvmf_rdma_accept_private_data	*accept_data;
	struct ibv_device_attr				attr;
	int						ret;
	struct rdma_cm_event				*event;
	struct spdk_nvme_ctrlr				*ctrlr;

	ret = ibv_query_device(rqpair->cm_id->verbs, &attr);
	if (ret != 0) {
		SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
		return ret;
	}

	/* Cap inbound RDMA reads at what the device supports. */
	param.responder_resources = spdk_min(rqpair->num_entries, attr.max_qp_rd_atom);

	ctrlr = rqpair->qpair.ctrlr;
	if (!ctrlr) {
		return -1;
	}

	request_data.qid = rqpair->qpair.id;
	request_data.hrqsize = rqpair->num_entries;
	/* hsqsize is 0-based per the NVMe-oF RDMA binding. */
	request_data.hsqsize = rqpair->num_entries - 1;
	request_data.cntlid = ctrlr->cntlid;

	param.private_data = &request_data;
	param.private_data_len = sizeof(request_data);
	param.retry_count = 7;
	param.rnr_retry_count = 7;

	ret = rdma_connect(rqpair->cm_id, &param);
	if (ret) {
		SPDK_ERRLOG("nvme rdma connect error\n");
		return ret;
	}

	event = nvme_rdma_get_event(rqpair->cm_channel, RDMA_CM_EVENT_ESTABLISHED);
	if (event == NULL) {
		SPDK_ERRLOG("RDMA connect error\n");
		return -1;
	}

	accept_data = (struct spdk_nvmf_rdma_accept_private_data *)event->param.conn.private_data;
	if (accept_data == NULL) {
		rdma_ack_cm_event(event);
		SPDK_ERRLOG("NVMe-oF target did not return accept data\n");
		return -1;
	}

	SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested queue depth %d. Actually got queue depth %d.\n",
		      rqpair->num_entries, accept_data->crqsize);

	/* Honor the smaller of what we asked for and what the target granted. */
	rqpair->num_entries = spdk_min(rqpair->num_entries, accept_data->crqsize);

	rdma_ack_cm_event(event);

	return 0;
}
643 | ||
/*
 * Resolve addr/service strings into a sockaddr_storage via getaddrinfo,
 * using the first result.  Returns 0 on success, a non-zero getaddrinfo
 * code on lookup failure, or EINVAL if the resolved address does not fit.
 */
static int
nvme_rdma_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service)
{
	struct addrinfo *res;
	struct addrinfo hints;
	int ret;

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = family;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_protocol = 0;

	ret = getaddrinfo(addr, service, &hints, &res);
	if (ret) {
		SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret);
		return ret;
	}

	if (res->ai_addrlen > sizeof(*sa)) {
		SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen);
		ret = EINVAL;
	} else {
		memcpy(sa, res->ai_addr, res->ai_addrlen);
	}

	freeaddrinfo(res);
	return ret;
}
672 | ||
/*
 * spdk_mem_map notify callback for the per-PD registration map (cb_ctx is
 * the ibv_pd).  On REGISTER, either ibv_reg_mr()s the region and stores the
 * ibv_mr pointer as the translation, or — when the application supplies
 * get_rkey — stores the hook's rkey instead.  On UNREGISTER the MR (if we
 * own one) is de-registered and the translation cleared.  Returns 0 or a
 * negative errno from the mem-map calls.
 */
static int
nvme_rdma_mr_map_notify(void *cb_ctx, struct spdk_mem_map *map,
			enum spdk_mem_map_notify_action action,
			void *vaddr, size_t size)
{
	struct ibv_pd *pd = cb_ctx;
	struct ibv_mr *mr;
	int rc;

	switch (action) {
	case SPDK_MEM_MAP_NOTIFY_REGISTER:
		if (!g_nvme_hooks.get_rkey) {
			mr = ibv_reg_mr(pd, vaddr, size,
					IBV_ACCESS_LOCAL_WRITE |
					IBV_ACCESS_REMOTE_READ |
					IBV_ACCESS_REMOTE_WRITE);
			if (mr == NULL) {
				SPDK_ERRLOG("ibv_reg_mr() failed\n");
				return -EFAULT;
			} else {
				/* Translation value is the ibv_mr pointer itself. */
				rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
			}
		} else {
			/* External registration: translation is the hook-provided rkey. */
			rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size,
							  g_nvme_hooks.get_rkey(pd, vaddr, size));
		}
		break;
	case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
		if (!g_nvme_hooks.get_rkey) {
			mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
			if (mr) {
				ibv_dereg_mr(mr);
			}
		}
		rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
		break;
	default:
		SPDK_UNREACHABLE();
	}

	return rc;
}
7c673cae | 715 | |
9f95a23c TL |
/*
 * Mem-map "are_contiguous" callback.  Two adjacent translations belong to
 * one registration exactly when they carry the identical value — the start
 * of the RDMA MR — so plain equality is the whole test.
 */
static int
nvme_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2)
{
	if (addr_1 != addr_2) {
		return 0;
	}

	return 1;
}
722 | ||
7c673cae FG |
/*
 * Attach the qpair to the shared memory-registration map for its protection
 * domain, creating the map (and its spdk_mem_map) on first use.  Maps are
 * looked up in g_rdma_mr_maps under g_rdma_mr_maps_mutex and reference
 * counted.  Returns 0 on success, -1 on allocation failure.
 */
static int
nvme_rdma_register_mem(struct nvme_rdma_qpair *rqpair)
{
	struct ibv_pd *pd = rqpair->cm_id->qp->pd;
	struct spdk_nvme_rdma_mr_map *mr_map;
	const struct spdk_mem_map_ops nvme_rdma_map_ops = {
		.notify_cb = nvme_rdma_mr_map_notify,
		.are_contiguous = nvme_rdma_check_contiguous_entries
	};

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);

	/* Look up existing mem map registration for this pd */
	LIST_FOREACH(mr_map, &g_rdma_mr_maps, link) {
		if (mr_map->pd == pd) {
			mr_map->ref++;
			rqpair->mr_map = mr_map;
			pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
			return 0;
		}
	}

	mr_map = calloc(1, sizeof(*mr_map));
	if (mr_map == NULL) {
		SPDK_ERRLOG("calloc() failed\n");
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return -1;
	}

	mr_map->ref = 1;
	mr_map->pd = pd;
	/* The pd is passed through as cb_ctx to nvme_rdma_mr_map_notify. */
	mr_map->map = spdk_mem_map_alloc((uint64_t)NULL, &nvme_rdma_map_ops, pd);
	if (mr_map->map == NULL) {
		SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
		free(mr_map);
		pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
		return -1;
	}

	rqpair->mr_map = mr_map;
	LIST_INSERT_HEAD(&g_rdma_mr_maps, mr_map, link);

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);

	return 0;
}
769 | ||
/* Drop the qpair's reference on its shared per-PD registration map,
 * destroying the map when the last reference goes away.  Safe to call when
 * the qpair never attached (mr_map == NULL). */
static void
nvme_rdma_unregister_mem(struct nvme_rdma_qpair *rqpair)
{
	struct spdk_nvme_rdma_mr_map *mr_map;

	mr_map = rqpair->mr_map;
	rqpair->mr_map = NULL;

	if (mr_map == NULL) {
		return;
	}

	pthread_mutex_lock(&g_rdma_mr_maps_mutex);

	assert(mr_map->ref > 0);
	mr_map->ref--;
	if (mr_map->ref == 0) {
		LIST_REMOVE(mr_map, link);
		spdk_mem_map_free(&mr_map->map);
		free(mr_map);
	}

	pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
}
794 | ||
/*
 * Full qpair bring-up: create the CM event channel and cm_id, resolve
 * address/route, create the CQ/QP, perform the RDMA CM connect, register
 * command/response buffers and payload memory, then send the NVMe-oF
 * Fabrics CONNECT.  Returns 0 on success, -1 on any failure.
 *
 * NOTE(review): error paths return without tearing down the resources
 * created so far (cm_channel, cm_id, CQ/QP) — presumably the caller invokes
 * nvme_rdma_qpair_destroy on failure; verify against callers.
 */
static int
nvme_rdma_qpair_connect(struct nvme_rdma_qpair *rqpair)
{
	struct sockaddr_storage dst_addr;
	struct sockaddr_storage src_addr;
	bool src_addr_specified;
	int rc;
	struct spdk_nvme_ctrlr *ctrlr;
	int family;

	rqpair->cm_channel = rdma_create_event_channel();
	if (rqpair->cm_channel == NULL) {
		SPDK_ERRLOG("rdma_create_event_channel() failed\n");
		return -1;
	}

	ctrlr = rqpair->qpair.ctrlr;

	switch (ctrlr->trid.adrfam) {
	case SPDK_NVMF_ADRFAM_IPV4:
		family = AF_INET;
		break;
	case SPDK_NVMF_ADRFAM_IPV6:
		family = AF_INET6;
		break;
	default:
		SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam);
		return -1;
	}

	SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family);

	memset(&dst_addr, 0, sizeof(dst_addr));

	SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid);
	rc = nvme_rdma_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid);
	if (rc != 0) {
		SPDK_ERRLOG("dst_addr nvme_rdma_parse_addr() failed\n");
		return -1;
	}

	/* The source address is optional; only resolve it when configured. */
	if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) {
		memset(&src_addr, 0, sizeof(src_addr));
		rc = nvme_rdma_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid);
		if (rc != 0) {
			SPDK_ERRLOG("src_addr nvme_rdma_parse_addr() failed\n");
			return -1;
		}
		src_addr_specified = true;
	} else {
		src_addr_specified = false;
	}

	rc = rdma_create_id(rqpair->cm_channel, &rqpair->cm_id, rqpair, RDMA_PS_TCP);
	if (rc < 0) {
		SPDK_ERRLOG("rdma_create_id() failed\n");
		return -1;
	}

	rc = nvme_rdma_resolve_addr(rqpair,
				    src_addr_specified ? (struct sockaddr *)&src_addr : NULL,
				    (struct sockaddr *)&dst_addr, rqpair->cm_channel);
	if (rc < 0) {
		SPDK_ERRLOG("nvme_rdma_resolve_addr() failed\n");
		return -1;
	}

	rc = nvme_rdma_qpair_init(rqpair);
	if (rc < 0) {
		SPDK_ERRLOG("nvme_rdma_qpair_init() failed\n");
		return -1;
	}

	rc = nvme_rdma_connect(rqpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect the rqpair\n");
		return -1;
	}

	rc = nvme_rdma_register_reqs(rqpair);
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
	if (rc) {
		SPDK_ERRLOG("Unable to register rqpair RDMA requests\n");
		return -1;
	}
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests registered\n");

	rc = nvme_rdma_register_rsps(rqpair);
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
	if (rc < 0) {
		SPDK_ERRLOG("Unable to register rqpair RDMA responses\n");
		return -1;
	}
	SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses registered\n");

	rc = nvme_rdma_register_mem(rqpair);
	if (rc < 0) {
		SPDK_ERRLOG("Unable to register memory for RDMA\n");
		return -1;
	}

	rc = nvme_fabric_qpair_connect(&rqpair->qpair, rqpair->num_entries);
	if (rc < 0) {
		SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n");
		return -1;
	}

	return 0;
}
904 | ||
905 | /* | |
906 | * Build SGL describing empty payload. | |
907 | */ | |
908 | static int | |
11fdf7f2 | 909 | nvme_rdma_build_null_request(struct spdk_nvme_rdma_req *rdma_req) |
7c673cae | 910 | { |
11fdf7f2 | 911 | struct nvme_request *req = rdma_req->req; |
7c673cae | 912 | |
11fdf7f2 | 913 | req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; |
7c673cae | 914 | |
11fdf7f2 TL |
915 | /* The first element of this SGL is pointing at an |
916 | * spdk_nvmf_cmd object. For this particular command, | |
917 | * we only need the first 64 bytes corresponding to | |
918 | * the NVMe command. */ | |
919 | rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); | |
920 | ||
921 | /* The RDMA SGL needs one element describing the NVMe command. */ | |
922 | rdma_req->send_wr.num_sge = 1; | |
923 | ||
924 | req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; | |
925 | req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; | |
926 | req->cmd.dptr.sgl1.keyed.length = 0; | |
927 | req->cmd.dptr.sgl1.keyed.key = 0; | |
928 | req->cmd.dptr.sgl1.address = 0; | |
7c673cae FG |
929 | |
930 | return 0; | |
931 | } | |
932 | ||
933 | /* | |
11fdf7f2 | 934 | * Build inline SGL describing contiguous payload buffer. |
7c673cae FG |
935 | */ |
936 | static int | |
11fdf7f2 TL |
937 | nvme_rdma_build_contig_inline_request(struct nvme_rdma_qpair *rqpair, |
938 | struct spdk_nvme_rdma_req *rdma_req) | |
7c673cae | 939 | { |
11fdf7f2 | 940 | struct nvme_request *req = rdma_req->req; |
7c673cae | 941 | struct ibv_mr *mr; |
11fdf7f2 TL |
942 | void *payload; |
943 | uint64_t requested_size; | |
7c673cae | 944 | |
11fdf7f2 | 945 | payload = req->payload.contig_or_cb_arg + req->payload_offset; |
7c673cae | 946 | assert(req->payload_size != 0); |
11fdf7f2 | 947 | assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); |
7c673cae | 948 | |
11fdf7f2 | 949 | requested_size = req->payload_size; |
11fdf7f2 | 950 | |
9f95a23c TL |
951 | if (!g_nvme_hooks.get_rkey) { |
952 | mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, | |
953 | (uint64_t)payload, &requested_size); | |
954 | ||
955 | if (mr == NULL || requested_size < req->payload_size) { | |
956 | if (mr) { | |
957 | SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n"); | |
958 | } | |
959 | return -EINVAL; | |
960 | } | |
961 | rdma_req->send_sgl[1].lkey = mr->lkey; | |
962 | } else { | |
963 | rdma_req->send_sgl[1].lkey = spdk_mem_map_translate(rqpair->mr_map->map, | |
964 | (uint64_t)payload, | |
965 | &requested_size); | |
966 | ||
7c673cae FG |
967 | } |
968 | ||
11fdf7f2 TL |
969 | /* The first element of this SGL is pointing at an |
970 | * spdk_nvmf_cmd object. For this particular command, | |
971 | * we only need the first 64 bytes corresponding to | |
972 | * the NVMe command. */ | |
973 | rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); | |
974 | ||
975 | rdma_req->send_sgl[1].addr = (uint64_t)payload; | |
976 | rdma_req->send_sgl[1].length = (uint32_t)req->payload_size; | |
11fdf7f2 TL |
977 | |
978 | /* The RDMA SGL contains two elements. The first describes | |
979 | * the NVMe command and the second describes the data | |
980 | * payload. */ | |
981 | rdma_req->send_wr.num_sge = 2; | |
982 | ||
983 | req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; | |
984 | req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; | |
985 | req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; | |
986 | req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size; | |
987 | /* Inline only supported for icdoff == 0 currently. This function will | |
988 | * not get called for controllers with other values. */ | |
989 | req->cmd.dptr.sgl1.address = (uint64_t)0; | |
7c673cae FG |
990 | |
991 | return 0; | |
992 | } | |
993 | ||
994 | /* | |
11fdf7f2 | 995 | * Build SGL describing contiguous payload buffer. |
7c673cae FG |
996 | */ |
997 | static int | |
11fdf7f2 TL |
998 | nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair, |
999 | struct spdk_nvme_rdma_req *rdma_req) | |
7c673cae | 1000 | { |
11fdf7f2 TL |
1001 | struct nvme_request *req = rdma_req->req; |
1002 | void *payload = req->payload.contig_or_cb_arg + req->payload_offset; | |
7c673cae | 1003 | struct ibv_mr *mr; |
11fdf7f2 | 1004 | uint64_t requested_size; |
7c673cae FG |
1005 | |
1006 | assert(req->payload_size != 0); | |
11fdf7f2 | 1007 | assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); |
7c673cae | 1008 | |
11fdf7f2 | 1009 | requested_size = req->payload_size; |
9f95a23c TL |
1010 | if (!g_nvme_hooks.get_rkey) { |
1011 | ||
1012 | mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)payload, | |
1013 | &requested_size); | |
1014 | if (mr == NULL) { | |
1015 | return -1; | |
1016 | } | |
1017 | req->cmd.dptr.sgl1.keyed.key = mr->rkey; | |
1018 | } else { | |
1019 | req->cmd.dptr.sgl1.keyed.key = spdk_mem_map_translate(rqpair->mr_map->map, | |
1020 | (uint64_t)payload, | |
1021 | &requested_size); | |
1022 | } | |
1023 | ||
1024 | if (requested_size < req->payload_size) { | |
1025 | SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n"); | |
7c673cae FG |
1026 | return -1; |
1027 | } | |
1028 | ||
11fdf7f2 TL |
1029 | /* The first element of this SGL is pointing at an |
1030 | * spdk_nvmf_cmd object. For this particular command, | |
1031 | * we only need the first 64 bytes corresponding to | |
1032 | * the NVMe command. */ | |
1033 | rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); | |
7c673cae | 1034 | |
11fdf7f2 TL |
1035 | /* The RDMA SGL needs one element describing the NVMe command. */ |
1036 | rdma_req->send_wr.num_sge = 1; | |
7c673cae | 1037 | |
11fdf7f2 | 1038 | req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; |
7c673cae FG |
1039 | req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; |
1040 | req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; | |
11fdf7f2 | 1041 | req->cmd.dptr.sgl1.keyed.length = req->payload_size; |
11fdf7f2 | 1042 | req->cmd.dptr.sgl1.address = (uint64_t)payload; |
7c673cae FG |
1043 | |
1044 | return 0; | |
1045 | } | |
1046 | ||
11fdf7f2 TL |
1047 | /* |
1048 | * Build SGL describing scattered payload buffer. | |
1049 | */ | |
7c673cae | 1050 | static int |
11fdf7f2 TL |
1051 | nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair, |
1052 | struct spdk_nvme_rdma_req *rdma_req) | |
7c673cae | 1053 | { |
11fdf7f2 TL |
1054 | struct nvme_request *req = rdma_req->req; |
1055 | struct spdk_nvmf_cmd *cmd = &rqpair->cmds[rdma_req->id]; | |
1056 | struct ibv_mr *mr = NULL; | |
1057 | void *virt_addr; | |
1058 | uint64_t remaining_size, mr_length; | |
1059 | uint32_t sge_length; | |
1060 | int rc, max_num_sgl, num_sgl_desc; | |
7c673cae | 1061 | |
11fdf7f2 TL |
1062 | assert(req->payload_size != 0); |
1063 | assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); | |
1064 | assert(req->payload.reset_sgl_fn != NULL); | |
1065 | assert(req->payload.next_sge_fn != NULL); | |
1066 | req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); | |
7c673cae | 1067 | |
11fdf7f2 | 1068 | max_num_sgl = req->qpair->ctrlr->max_sges; |
7c673cae | 1069 | |
11fdf7f2 TL |
1070 | remaining_size = req->payload_size; |
1071 | num_sgl_desc = 0; | |
1072 | do { | |
1073 | rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &sge_length); | |
1074 | if (rc) { | |
1075 | return -1; | |
1076 | } | |
7c673cae | 1077 | |
11fdf7f2 TL |
1078 | sge_length = spdk_min(remaining_size, sge_length); |
1079 | mr_length = sge_length; | |
7c673cae | 1080 | |
9f95a23c TL |
1081 | if (!g_nvme_hooks.get_rkey) { |
1082 | mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, | |
1083 | (uint64_t)virt_addr, | |
1084 | &mr_length); | |
1085 | if (mr == NULL) { | |
1086 | return -1; | |
1087 | } | |
1088 | cmd->sgl[num_sgl_desc].keyed.key = mr->rkey; | |
1089 | } else { | |
1090 | cmd->sgl[num_sgl_desc].keyed.key = spdk_mem_map_translate(rqpair->mr_map->map, | |
1091 | (uint64_t)virt_addr, | |
1092 | &mr_length); | |
1093 | } | |
7c673cae | 1094 | |
9f95a23c TL |
1095 | if (mr_length < sge_length) { |
1096 | SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n"); | |
11fdf7f2 TL |
1097 | return -1; |
1098 | } | |
7c673cae | 1099 | |
11fdf7f2 TL |
1100 | cmd->sgl[num_sgl_desc].keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; |
1101 | cmd->sgl[num_sgl_desc].keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; | |
1102 | cmd->sgl[num_sgl_desc].keyed.length = sge_length; | |
11fdf7f2 | 1103 | cmd->sgl[num_sgl_desc].address = (uint64_t)virt_addr; |
7c673cae | 1104 | |
11fdf7f2 TL |
1105 | remaining_size -= sge_length; |
1106 | num_sgl_desc++; | |
1107 | } while (remaining_size > 0 && num_sgl_desc < max_num_sgl); | |
1108 | ||
1109 | ||
1110 | /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */ | |
1111 | if (remaining_size > 0) { | |
7c673cae FG |
1112 | return -1; |
1113 | } | |
1114 | ||
11fdf7f2 | 1115 | req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; |
7c673cae | 1116 | |
11fdf7f2 TL |
1117 | /* The RDMA SGL needs one element describing some portion |
1118 | * of the spdk_nvmf_cmd structure. */ | |
1119 | rdma_req->send_wr.num_sge = 1; | |
1120 | ||
1121 | /* | |
1122 | * If only one SGL descriptor is required, it can be embedded directly in the command | |
1123 | * as a data block descriptor. | |
1124 | */ | |
1125 | if (num_sgl_desc == 1) { | |
1126 | /* The first element of this SGL is pointing at an | |
1127 | * spdk_nvmf_cmd object. For this particular command, | |
1128 | * we only need the first 64 bytes corresponding to | |
1129 | * the NVMe command. */ | |
1130 | rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); | |
1131 | ||
9f95a23c TL |
1132 | req->cmd.dptr.sgl1.keyed.type = cmd->sgl[0].keyed.type; |
1133 | req->cmd.dptr.sgl1.keyed.subtype = cmd->sgl[0].keyed.subtype; | |
1134 | req->cmd.dptr.sgl1.keyed.length = cmd->sgl[0].keyed.length; | |
1135 | req->cmd.dptr.sgl1.keyed.key = cmd->sgl[0].keyed.key; | |
1136 | req->cmd.dptr.sgl1.address = cmd->sgl[0].address; | |
11fdf7f2 TL |
1137 | } else { |
1138 | /* | |
1139 | * Otherwise, The SGL descriptor embedded in the command must point to the list of | |
1140 | * SGL descriptors used to describe the operation. In that case it is a last segment descriptor. | |
1141 | */ | |
1142 | rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd) + sizeof(struct | |
1143 | spdk_nvme_sgl_descriptor) * num_sgl_desc; | |
1144 | ||
1145 | req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; | |
1146 | req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; | |
1147 | req->cmd.dptr.sgl1.unkeyed.length = num_sgl_desc * sizeof(struct spdk_nvme_sgl_descriptor); | |
1148 | req->cmd.dptr.sgl1.address = (uint64_t)0; | |
7c673cae FG |
1149 | } |
1150 | ||
1151 | return 0; | |
1152 | } | |
1153 | ||
11fdf7f2 TL |
1154 | /* |
1155 | * Build inline SGL describing sgl payload buffer. | |
1156 | */ | |
7c673cae | 1157 | static int |
11fdf7f2 TL |
1158 | nvme_rdma_build_sgl_inline_request(struct nvme_rdma_qpair *rqpair, |
1159 | struct spdk_nvme_rdma_req *rdma_req) | |
7c673cae | 1160 | { |
11fdf7f2 TL |
1161 | struct nvme_request *req = rdma_req->req; |
1162 | struct ibv_mr *mr; | |
1163 | uint32_t length; | |
1164 | uint64_t requested_size; | |
1165 | void *virt_addr; | |
9f95a23c | 1166 | int rc, i; |
7c673cae | 1167 | |
11fdf7f2 TL |
1168 | assert(req->payload_size != 0); |
1169 | assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); | |
1170 | assert(req->payload.reset_sgl_fn != NULL); | |
1171 | assert(req->payload.next_sge_fn != NULL); | |
1172 | req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); | |
7c673cae | 1173 | |
11fdf7f2 TL |
1174 | rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length); |
1175 | if (rc) { | |
7c673cae FG |
1176 | return -1; |
1177 | } | |
1178 | ||
11fdf7f2 | 1179 | if (length < req->payload_size) { |
9f95a23c TL |
1180 | SPDK_DEBUGLOG(SPDK_LOG_NVME, "Inline SGL request split so sending separately.\n"); |
1181 | return nvme_rdma_build_sgl_request(rqpair, rdma_req); | |
7c673cae FG |
1182 | } |
1183 | ||
9f95a23c TL |
1184 | if (length > req->payload_size) { |
1185 | length = req->payload_size; | |
1186 | } | |
1187 | ||
1188 | requested_size = length; | |
11fdf7f2 TL |
1189 | mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)virt_addr, |
1190 | &requested_size); | |
9f95a23c TL |
1191 | if (mr == NULL || requested_size < length) { |
1192 | for (i = 1; i < rdma_req->send_wr.num_sge; i++) { | |
1193 | rdma_req->send_sgl[i].addr = 0; | |
1194 | rdma_req->send_sgl[i].length = 0; | |
1195 | rdma_req->send_sgl[i].lkey = 0; | |
1196 | } | |
1197 | ||
1198 | if (mr) { | |
1199 | SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n"); | |
1200 | } | |
7c673cae FG |
1201 | return -1; |
1202 | } | |
1203 | ||
9f95a23c TL |
1204 | rdma_req->send_sgl[1].addr = (uint64_t)virt_addr; |
1205 | rdma_req->send_sgl[1].length = length; | |
1206 | rdma_req->send_sgl[1].lkey = mr->lkey; | |
1207 | ||
1208 | rdma_req->send_wr.num_sge = 2; | |
1209 | ||
11fdf7f2 TL |
1210 | /* The first element of this SGL is pointing at an |
1211 | * spdk_nvmf_cmd object. For this particular command, | |
1212 | * we only need the first 64 bytes corresponding to | |
1213 | * the NVMe command. */ | |
1214 | rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); | |
1215 | ||
11fdf7f2 TL |
1216 | req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; |
1217 | req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; | |
1218 | req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; | |
1219 | req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size; | |
1220 | /* Inline only supported for icdoff == 0 currently. This function will | |
1221 | * not get called for controllers with other values. */ | |
1222 | req->cmd.dptr.sgl1.address = (uint64_t)0; | |
1223 | ||
1224 | return 0; | |
1225 | } | |
1226 | ||
1227 | static inline unsigned int | |
1228 | nvme_rdma_icdsz_bytes(struct spdk_nvme_ctrlr *ctrlr) | |
1229 | { | |
1230 | return (ctrlr->cdata.nvmf_specific.ioccsz * 16 - sizeof(struct spdk_nvme_cmd)); | |
1231 | } | |
1232 | ||
1233 | static int | |
1234 | nvme_rdma_req_init(struct nvme_rdma_qpair *rqpair, struct nvme_request *req, | |
1235 | struct spdk_nvme_rdma_req *rdma_req) | |
1236 | { | |
1237 | struct spdk_nvme_ctrlr *ctrlr = rqpair->qpair.ctrlr; | |
1238 | int rc; | |
1239 | ||
1240 | rdma_req->req = req; | |
1241 | req->cmd.cid = rdma_req->id; | |
7c673cae | 1242 | |
11fdf7f2 TL |
1243 | if (req->payload_size == 0) { |
1244 | rc = nvme_rdma_build_null_request(rdma_req); | |
1245 | } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) { | |
1246 | /* | |
1247 | * Check if icdoff is non zero, to avoid interop conflicts with | |
1248 | * targets with non-zero icdoff. Both SPDK and the Linux kernel | |
1249 | * targets use icdoff = 0. For targets with non-zero icdoff, we | |
1250 | * will currently just not use inline data for now. | |
1251 | */ | |
1252 | if (req->cmd.opc == SPDK_NVME_OPC_WRITE && | |
1253 | req->payload_size <= nvme_rdma_icdsz_bytes(ctrlr) && | |
1254 | (ctrlr->cdata.nvmf_specific.icdoff == 0)) { | |
1255 | rc = nvme_rdma_build_contig_inline_request(rqpair, rdma_req); | |
1256 | } else { | |
1257 | rc = nvme_rdma_build_contig_request(rqpair, rdma_req); | |
1258 | } | |
1259 | } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) { | |
1260 | if (req->cmd.opc == SPDK_NVME_OPC_WRITE && | |
1261 | req->payload_size <= nvme_rdma_icdsz_bytes(ctrlr) && | |
1262 | ctrlr->cdata.nvmf_specific.icdoff == 0) { | |
1263 | rc = nvme_rdma_build_sgl_inline_request(rqpair, rdma_req); | |
1264 | } else { | |
1265 | rc = nvme_rdma_build_sgl_request(rqpair, rdma_req); | |
1266 | } | |
7c673cae | 1267 | } else { |
11fdf7f2 | 1268 | rc = -1; |
7c673cae FG |
1269 | } |
1270 | ||
11fdf7f2 TL |
1271 | if (rc) { |
1272 | return rc; | |
1273 | } | |
1274 | ||
1275 | memcpy(&rqpair->cmds[rdma_req->id], &req->cmd, sizeof(req->cmd)); | |
7c673cae FG |
1276 | return 0; |
1277 | } | |
1278 | ||
1279 | static struct spdk_nvme_qpair * | |
1280 | nvme_rdma_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr, | |
1281 | uint16_t qid, uint32_t qsize, | |
1282 | enum spdk_nvme_qprio qprio, | |
1283 | uint32_t num_requests) | |
1284 | { | |
1285 | struct nvme_rdma_qpair *rqpair; | |
1286 | struct spdk_nvme_qpair *qpair; | |
1287 | int rc; | |
1288 | ||
1289 | rqpair = calloc(1, sizeof(struct nvme_rdma_qpair)); | |
1290 | if (!rqpair) { | |
1291 | SPDK_ERRLOG("failed to get create rqpair\n"); | |
1292 | return NULL; | |
1293 | } | |
1294 | ||
1295 | rqpair->num_entries = qsize; | |
1296 | ||
1297 | qpair = &rqpair->qpair; | |
1298 | ||
1299 | rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests); | |
1300 | if (rc != 0) { | |
1301 | return NULL; | |
1302 | } | |
1303 | ||
9f95a23c TL |
1304 | rc = nvme_rdma_alloc_reqs(rqpair); |
1305 | SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); | |
1306 | if (rc) { | |
1307 | SPDK_ERRLOG("Unable to allocate rqpair RDMA requests\n"); | |
1308 | return NULL; | |
1309 | } | |
1310 | SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests allocated\n"); | |
1311 | ||
1312 | rc = nvme_rdma_alloc_rsps(rqpair); | |
1313 | SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); | |
1314 | if (rc < 0) { | |
1315 | SPDK_ERRLOG("Unable to allocate rqpair RDMA responses\n"); | |
1316 | return NULL; | |
1317 | } | |
1318 | SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses allocated\n"); | |
1319 | ||
7c673cae FG |
1320 | rc = nvme_rdma_qpair_connect(rqpair); |
1321 | if (rc < 0) { | |
1322 | nvme_rdma_qpair_destroy(qpair); | |
1323 | return NULL; | |
1324 | } | |
1325 | ||
1326 | return qpair; | |
1327 | } | |
1328 | ||
/*
 * Tear down the RDMA resources of @qpair: unregister memory and the
 * request/response pools, then destroy the QP, CM id, completion queue, and
 * event channel. Safe to call on a partially-connected qpair — every verbs
 * object is NULL-checked before destruction.
 */
static void
nvme_rdma_qpair_disconnect(struct spdk_nvme_qpair *qpair)
{
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);

	nvme_rdma_unregister_mem(rqpair);
	nvme_rdma_unregister_reqs(rqpair);
	nvme_rdma_unregister_rsps(rqpair);

	/* The QP must go before its CM id. */
	if (rqpair->cm_id) {
		if (rqpair->cm_id->qp) {
			rdma_destroy_qp(rqpair->cm_id);
		}
		rdma_destroy_id(rqpair->cm_id);
	}

	if (rqpair->cq) {
		ibv_destroy_cq(rqpair->cq);
	}

	if (rqpair->cm_channel) {
		rdma_destroy_event_channel(rqpair->cm_channel);
	}
}
7c673cae | 1353 | |
9f95a23c TL |
1354 | static int |
1355 | nvme_rdma_qpair_destroy(struct spdk_nvme_qpair *qpair) | |
1356 | { | |
1357 | struct nvme_rdma_qpair *rqpair; | |
1358 | ||
1359 | if (!qpair) { | |
1360 | return -1; | |
1361 | } | |
1362 | nvme_rdma_qpair_disconnect(qpair); | |
1363 | nvme_rdma_qpair_abort_reqs(qpair, 1); | |
1364 | nvme_qpair_deinit(qpair); | |
1365 | ||
1366 | rqpair = nvme_rdma_qpair(qpair); | |
1367 | ||
1368 | nvme_rdma_free_reqs(rqpair); | |
1369 | nvme_rdma_free_rsps(rqpair); | |
7c673cae FG |
1370 | free(rqpair); |
1371 | ||
1372 | return 0; | |
1373 | } | |
1374 | ||
1375 | struct spdk_nvme_qpair * | |
1376 | nvme_rdma_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, | |
11fdf7f2 | 1377 | const struct spdk_nvme_io_qpair_opts *opts) |
7c673cae | 1378 | { |
11fdf7f2 TL |
1379 | return nvme_rdma_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio, |
1380 | opts->io_queue_requests); | |
7c673cae FG |
1381 | } |
1382 | ||
/*
 * Transport-specific enable hook; the RDMA transport has no extra work to do
 * beyond the generic fabrics enable sequence.
 */
int
nvme_rdma_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
{
	return 0;
}
1389 | ||
7c673cae FG |
1390 | /* This function must only be called while holding g_spdk_nvme_driver->lock */ |
1391 | int | |
9f95a23c | 1392 | nvme_rdma_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, |
11fdf7f2 | 1393 | bool direct_connect) |
7c673cae FG |
1394 | { |
1395 | struct spdk_nvme_ctrlr_opts discovery_opts; | |
1396 | struct spdk_nvme_ctrlr *discovery_ctrlr; | |
7c673cae | 1397 | union spdk_nvme_cc_register cc; |
7c673cae | 1398 | int rc; |
11fdf7f2 | 1399 | struct nvme_completion_poll_status status; |
7c673cae | 1400 | |
9f95a23c | 1401 | if (strcmp(probe_ctx->trid.subnqn, SPDK_NVMF_DISCOVERY_NQN) != 0) { |
11fdf7f2 | 1402 | /* It is not a discovery_ctrlr info and try to directly connect it */ |
9f95a23c | 1403 | rc = nvme_ctrlr_probe(&probe_ctx->trid, probe_ctx, NULL); |
11fdf7f2 TL |
1404 | return rc; |
1405 | } | |
1406 | ||
1407 | spdk_nvme_ctrlr_get_default_ctrlr_opts(&discovery_opts, sizeof(discovery_opts)); | |
7c673cae FG |
1408 | /* For discovery_ctrlr set the timeout to 0 */ |
1409 | discovery_opts.keep_alive_timeout_ms = 0; | |
1410 | ||
9f95a23c | 1411 | discovery_ctrlr = nvme_rdma_ctrlr_construct(&probe_ctx->trid, &discovery_opts, NULL); |
7c673cae FG |
1412 | if (discovery_ctrlr == NULL) { |
1413 | return -1; | |
1414 | } | |
1415 | ||
1416 | /* TODO: this should be using the normal NVMe controller initialization process */ | |
1417 | cc.raw = 0; | |
1418 | cc.bits.en = 1; | |
1419 | cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */ | |
1420 | cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */ | |
1421 | rc = nvme_transport_ctrlr_set_reg_4(discovery_ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), | |
1422 | cc.raw); | |
1423 | if (rc < 0) { | |
1424 | SPDK_ERRLOG("Failed to set cc\n"); | |
1425 | nvme_ctrlr_destruct(discovery_ctrlr); | |
1426 | return -1; | |
1427 | } | |
1428 | ||
11fdf7f2 TL |
1429 | /* Direct attach through spdk_nvme_connect() API */ |
1430 | if (direct_connect == true) { | |
9f95a23c TL |
1431 | /* get the cdata info */ |
1432 | rc = nvme_ctrlr_cmd_identify(discovery_ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0, | |
1433 | &discovery_ctrlr->cdata, sizeof(discovery_ctrlr->cdata), | |
1434 | nvme_completion_poll_cb, &status); | |
1435 | if (rc != 0) { | |
1436 | SPDK_ERRLOG("Failed to identify cdata\n"); | |
1437 | return rc; | |
1438 | } | |
1439 | ||
1440 | if (spdk_nvme_wait_for_completion(discovery_ctrlr->adminq, &status)) { | |
1441 | SPDK_ERRLOG("nvme_identify_controller failed!\n"); | |
1442 | return -ENXIO; | |
1443 | } | |
1444 | ||
11fdf7f2 TL |
1445 | /* Set the ready state to skip the normal init process */ |
1446 | discovery_ctrlr->state = NVME_CTRLR_STATE_READY; | |
9f95a23c | 1447 | nvme_ctrlr_connected(probe_ctx, discovery_ctrlr); |
11fdf7f2 TL |
1448 | nvme_ctrlr_add_process(discovery_ctrlr, 0); |
1449 | return 0; | |
7c673cae FG |
1450 | } |
1451 | ||
9f95a23c | 1452 | rc = nvme_fabric_ctrlr_discover(discovery_ctrlr, probe_ctx); |
7c673cae | 1453 | nvme_ctrlr_destruct(discovery_ctrlr); |
11fdf7f2 | 1454 | return rc; |
7c673cae FG |
1455 | } |
1456 | ||
/*
 * Allocate and initialize an RDMA controller for @trid with options @opts.
 * Creates the admin qpair, reads CAP/VS, and registers the current process.
 * Returns the embedded generic controller, or NULL on failure (all partially
 * constructed state is released via nvme_rdma_ctrlr_destruct /
 * nvme_ctrlr_destruct). @devhandle is unused for this transport.
 */
struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
		const struct spdk_nvme_ctrlr_opts *opts,
		void *devhandle)
{
	struct nvme_rdma_ctrlr *rctrlr;
	union spdk_nvme_cap_register cap;
	union spdk_nvme_vs_register vs;
	int rc;

	rctrlr = calloc(1, sizeof(struct nvme_rdma_ctrlr));
	if (rctrlr == NULL) {
		SPDK_ERRLOG("could not allocate ctrlr\n");
		return NULL;
	}

	/* Note: the memcpy below copies the full trid (including trtype), so
	 * this explicit trtype assignment is immediately overwritten. */
	rctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_RDMA;
	rctrlr->ctrlr.opts = *opts;
	memcpy(&rctrlr->ctrlr.trid, trid, sizeof(rctrlr->ctrlr.trid));

	rc = nvme_ctrlr_construct(&rctrlr->ctrlr);
	if (rc != 0) {
		free(rctrlr);
		return NULL;
	}

	rctrlr->ctrlr.adminq = nvme_rdma_ctrlr_create_qpair(&rctrlr->ctrlr, 0,
			       SPDK_NVMF_MIN_ADMIN_QUEUE_ENTRIES, 0, SPDK_NVMF_MIN_ADMIN_QUEUE_ENTRIES);
	if (!rctrlr->ctrlr.adminq) {
		SPDK_ERRLOG("failed to create admin qpair\n");
		nvme_rdma_ctrlr_destruct(&rctrlr->ctrlr);
		return NULL;
	}

	if (nvme_ctrlr_get_cap(&rctrlr->ctrlr, &cap)) {
		SPDK_ERRLOG("get_cap() failed\n");
		nvme_ctrlr_destruct(&rctrlr->ctrlr);
		return NULL;
	}

	if (nvme_ctrlr_get_vs(&rctrlr->ctrlr, &vs)) {
		SPDK_ERRLOG("get_vs() failed\n");
		nvme_ctrlr_destruct(&rctrlr->ctrlr);
		return NULL;
	}

	if (nvme_ctrlr_add_process(&rctrlr->ctrlr, 0) != 0) {
		SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n");
		nvme_ctrlr_destruct(&rctrlr->ctrlr);
		return NULL;
	}

	nvme_ctrlr_init_cap(&rctrlr->ctrlr, &cap, &vs);

	SPDK_DEBUGLOG(SPDK_LOG_NVME, "successfully initialized the nvmf ctrlr\n");
	return &rctrlr->ctrlr;
}
1513 | ||
1514 | int | |
1515 | nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) | |
1516 | { | |
1517 | struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr); | |
1518 | ||
1519 | if (ctrlr->adminq) { | |
1520 | nvme_rdma_qpair_destroy(ctrlr->adminq); | |
1521 | } | |
1522 | ||
11fdf7f2 TL |
1523 | nvme_ctrlr_destruct_finish(ctrlr); |
1524 | ||
7c673cae FG |
1525 | free(rctrlr); |
1526 | ||
1527 | return 0; | |
1528 | } | |
1529 | ||
/* Write a 32-bit controller register via the fabrics property-set command. */
int
nvme_rdma_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
{
	return nvme_fabric_ctrlr_set_reg_4(ctrlr, offset, value);
}
1535 | ||
/* Write a 64-bit controller register via the fabrics property-set command. */
int
nvme_rdma_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
{
	return nvme_fabric_ctrlr_set_reg_8(ctrlr, offset, value);
}
1541 | ||
/* Read a 32-bit controller register via the fabrics property-get command. */
int
nvme_rdma_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
{
	return nvme_fabric_ctrlr_get_reg_4(ctrlr, offset, value);
}
1547 | ||
/* Read a 64-bit controller register via the fabrics property-get command. */
int
nvme_rdma_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
{
	return nvme_fabric_ctrlr_get_reg_8(ctrlr, offset, value);
}
1553 | ||
1554 | int | |
1555 | nvme_rdma_qpair_submit_request(struct spdk_nvme_qpair *qpair, | |
1556 | struct nvme_request *req) | |
1557 | { | |
1558 | struct nvme_rdma_qpair *rqpair; | |
1559 | struct spdk_nvme_rdma_req *rdma_req; | |
1560 | struct ibv_send_wr *wr, *bad_wr = NULL; | |
1561 | int rc; | |
1562 | ||
1563 | rqpair = nvme_rdma_qpair(qpair); | |
1564 | assert(rqpair != NULL); | |
1565 | assert(req != NULL); | |
1566 | ||
1567 | rdma_req = nvme_rdma_req_get(rqpair); | |
1568 | if (!rdma_req) { | |
1569 | /* | |
9f95a23c TL |
1570 | * No rdma_req is available, so queue the request to be |
1571 | * processed later. | |
7c673cae FG |
1572 | */ |
1573 | STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); | |
1574 | return 0; | |
1575 | } | |
1576 | ||
1577 | if (nvme_rdma_req_init(rqpair, req, rdma_req)) { | |
1578 | SPDK_ERRLOG("nvme_rdma_req_init() failed\n"); | |
1579 | nvme_rdma_req_put(rqpair, rdma_req); | |
1580 | return -1; | |
1581 | } | |
1582 | ||
1583 | wr = &rdma_req->send_wr; | |
1584 | ||
1585 | nvme_rdma_trace_ibv_sge(wr->sg_list); | |
1586 | ||
1587 | rc = ibv_post_send(rqpair->cm_id->qp, wr, &bad_wr); | |
1588 | if (rc) { | |
11fdf7f2 | 1589 | SPDK_ERRLOG("Failure posting rdma send for NVMf completion: %d (%s)\n", rc, spdk_strerror(rc)); |
7c673cae FG |
1590 | } |
1591 | ||
1592 | return rc; | |
1593 | } | |
1594 | ||
/*
 * Delete an I/O qpair; the controller holds no extra per-qpair state, so
 * destroying the qpair is sufficient.
 */
int
nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	int rc = nvme_rdma_qpair_destroy(qpair);

	return rc;
}
1600 | ||
/*
 * (Re)establish the RDMA connection for @qpair.
 */
int
nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);

	return nvme_rdma_qpair_connect(rqpair);
}
1606 | ||
/*
 * Drop the RDMA connection for @qpair without freeing the qpair itself.
 */
void
nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	/* All teardown lives in the shared disconnect helper. */
	nvme_rdma_qpair_disconnect(qpair);
}
1612 | ||
/*
 * Transport reset hook — no RDMA-level reset work is required at present.
 */
int
nvme_rdma_qpair_reset(struct spdk_nvme_qpair *qpair)
{
	return 0;
}
1619 | ||
9f95a23c TL |
1620 | void |
1621 | nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) | |
7c673cae | 1622 | { |
9f95a23c TL |
1623 | struct spdk_nvme_rdma_req *rdma_req, *tmp; |
1624 | struct nvme_request *req; | |
1625 | struct spdk_nvme_cpl cpl; | |
1626 | struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); | |
1627 | ||
1628 | cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; | |
1629 | cpl.status.sct = SPDK_NVME_SCT_GENERIC; | |
1630 | cpl.status.dnr = dnr; | |
1631 | ||
1632 | TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { | |
1633 | assert(rdma_req->req != NULL); | |
1634 | req = rdma_req->req; | |
1635 | ||
1636 | nvme_rdma_req_complete(req, &cpl); | |
1637 | nvme_rdma_req_put(rqpair, rdma_req); | |
1638 | } | |
7c673cae FG |
1639 | } |
1640 | ||
/*
 * Walk the outstanding requests on @qpair and fire the owning process'
 * timeout callback for any request that has exceeded its deadline.
 * No-op before the controller is READY or when no timeout callback is set.
 */
static void
nvme_rdma_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
{
	uint64_t t02;
	struct spdk_nvme_rdma_req *rdma_req, *tmp;
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
	struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
	struct spdk_nvme_ctrlr_process *active_proc;

	/* Don't check timeouts during controller initialization. */
	if (ctrlr->state != NVME_CTRLR_STATE_READY) {
		return;
	}

	/* The admin queue is shared across processes, so the current process
	 * must be looked up; I/O queues carry their owner directly. */
	if (nvme_qpair_is_admin_queue(qpair)) {
		active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
	} else {
		active_proc = qpair->active_proc;
	}

	/* Only check timeouts if the current process has a timeout callback. */
	if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
		return;
	}

	t02 = spdk_get_ticks();
	TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
		assert(rdma_req->req != NULL);

		if (nvme_request_check_timeout(rdma_req->req, rdma_req->id, active_proc, t02)) {
			/*
			 * The requests are in order, so as soon as one has not timed out,
			 * stop iterating.
			 */
			break;
		}
	}
}
1679 | ||
7c673cae FG |
1680 | #define MAX_COMPLETIONS_PER_POLL 128 |
1681 | ||
/*
 * Poll the qpair's completion queue and process up to max_completions
 * NVMe responses.
 *
 * Returns the number of NVMe completions reaped (>= 0), or -1 on a CQ
 * polling error, a work-completion error status, a short receive, or an
 * unexpected opcode.  Note that only IBV_WC_RECV entries count toward
 * "reaped"; send completions only release request resources.
 */
int
nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair,
				    uint32_t max_completions)
{
	struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
	struct ibv_wc wc[MAX_COMPLETIONS_PER_POLL];
	int i, rc, batch_size;
	uint32_t reaped;
	struct ibv_cq *cq;
	struct spdk_nvme_rdma_req *rdma_req;

	/* 0 means "no caller-imposed limit"; in either case never exceed the
	 * queue depth, since that is all that can be outstanding at once. */
	if (max_completions == 0) {
		max_completions = rqpair->num_entries;
	} else {
		max_completions = spdk_min(max_completions, rqpair->num_entries);
	}

	cq = rqpair->cq;

	reaped = 0;
	do {
		/* Drain the CQ in batches bounded by the stack-allocated wc array. */
		batch_size = spdk_min((max_completions - reaped),
				      MAX_COMPLETIONS_PER_POLL);
		rc = ibv_poll_cq(cq, batch_size, wc);
		if (rc < 0) {
			/* NOTE(review): ibv_poll_cq() is not documented to set
			 * errno on failure — this message may log a stale
			 * errno value; confirm against the verbs spec. */
			SPDK_ERRLOG("Error polling CQ! (%d): %s\n",
				    errno, spdk_strerror(errno));
			return -1;
		} else if (rc == 0) {
			/* Ran out of completions */
			break;
		}

		for (i = 0; i < rc; i++) {
			/* Any non-IBV_WC_SUCCESS (0) status is treated as fatal
			 * for this poll. */
			if (wc[i].status) {
				SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n",
					    qpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status));
				return -1;
			}

			switch (wc[i].opcode) {
			case IBV_WC_RECV:
				/* An NVMe response capsule arrived. */
				SPDK_DEBUGLOG(SPDK_LOG_NVME, "CQ recv completion\n");

				reaped++;

				/* The capsule must hold at least a full NVMe
				 * completion entry. */
				if (wc[i].byte_len < sizeof(struct spdk_nvme_cpl)) {
					SPDK_ERRLOG("recv length %u less than expected response size\n", wc[i].byte_len);
					return -1;
				}

				if (nvme_rdma_recv(rqpair, wc[i].wr_id)) {
					SPDK_ERRLOG("nvme_rdma_recv processing failure\n");
					return -1;
				}
				break;

			case IBV_WC_SEND:
				/* For sends, wr_id carries the request pointer
				 * directly.  A request is recycled only after
				 * BOTH its send completion and its response
				 * have been seen; request_ready_to_put records
				 * whichever side arrives first. */
				rdma_req = (struct spdk_nvme_rdma_req *)wc[i].wr_id;

				if (rdma_req->request_ready_to_put) {
					nvme_rdma_req_put(rqpair, rdma_req);
				} else {
					rdma_req->request_ready_to_put = true;
				}
				break;

			default:
				SPDK_ERRLOG("Received an unexpected opcode on the CQ: %d\n", wc[i].opcode);
				return -1;
			}
		}
	} while (reaped < max_completions);

	/* Optionally fire timeout callbacks for requests still outstanding. */
	if (spdk_unlikely(rqpair->qpair.ctrlr->timeout_enabled)) {
		nvme_rdma_qpair_check_timeout(qpair);
	}

	return reaped;
}
1762 | ||
/*
 * Return the maximum data transfer size (in bytes) for a single request
 * on this transport.
 */
uint32_t
nvme_rdma_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
{
	/* TODO: this should be obtained from the NVMe-oF target rather than
	 * hard-coded as a local constant. */
	return NVME_RDMA_RW_BUFFER_SIZE;
}
1769 | ||
11fdf7f2 TL |
1770 | uint16_t |
1771 | nvme_rdma_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) | |
7c673cae | 1772 | { |
11fdf7f2 TL |
1773 | return spdk_min(ctrlr->cdata.nvmf_specific.msdbd, NVME_RDMA_MAX_SGL_DESCRIPTORS); |
1774 | } | |
1775 | ||
9f95a23c TL |
/*
 * RDMA-attached controllers have no memory-mapped register space, so
 * there is nothing to return; register access goes through Fabrics
 * property commands instead.
 */
volatile struct spdk_nvme_registers *
nvme_rdma_ctrlr_get_registers(struct spdk_nvme_ctrlr *ctrlr)
{
	return NULL;
}
1781 | ||
11fdf7f2 TL |
/*
 * Controller Memory Buffer is a PCIe-only feature; the RDMA transport
 * never provides CMB-backed I/O buffers, so always return NULL.
 */
void *
nvme_rdma_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size)
{
	return NULL;
}
1787 | ||
/*
 * Counterpart to nvme_rdma_ctrlr_alloc_cmb_io_buffer(); since no CMB
 * buffer is ever allocated on this transport, freeing is a no-op that
 * always succeeds.
 */
int
nvme_rdma_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size)
{
	return 0;
}
9f95a23c TL |
1793 | |
1794 | void | |
1795 | nvme_rdma_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) | |
1796 | { | |
1797 | struct spdk_nvme_rdma_req *rdma_req, *tmp; | |
1798 | struct nvme_request *req; | |
1799 | struct spdk_nvme_cpl cpl; | |
1800 | struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); | |
1801 | ||
1802 | cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; | |
1803 | cpl.status.sct = SPDK_NVME_SCT_GENERIC; | |
1804 | ||
1805 | TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { | |
1806 | if (rdma_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { | |
1807 | continue; | |
1808 | } | |
1809 | assert(rdma_req->req != NULL); | |
1810 | req = rdma_req->req; | |
1811 | ||
1812 | nvme_rdma_req_complete(req, &cpl); | |
1813 | nvme_rdma_req_put(rqpair, rdma_req); | |
1814 | } | |
1815 | } | |
1816 | ||
1817 | void | |
1818 | spdk_nvme_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) | |
1819 | { | |
1820 | g_nvme_hooks = *hooks; | |
1821 | } |