drivers/nvme/target/rdma.c
1 /*
2 * NVMe over Fabrics RDMA target.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 #include <linux/atomic.h>
16 #include <linux/ctype.h>
17 #include <linux/delay.h>
18 #include <linux/err.h>
19 #include <linux/init.h>
20 #include <linux/module.h>
21 #include <linux/nvme.h>
22 #include <linux/slab.h>
23 #include <linux/string.h>
24 #include <linux/wait.h>
25 #include <linux/inet.h>
26 #include <asm/unaligned.h>
27
28 #include <rdma/ib_verbs.h>
29 #include <rdma/rdma_cm.h>
30 #include <rdma/rw.h>
31
32 #include <linux/nvme-rdma.h>
33 #include "nvmet.h"
34
35 /*
36 * Inline data: one page by default, at most 4 SGEs and max(16KB, PAGE_SIZE).
37 */
38 #define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE PAGE_SIZE
39 #define NVMET_RDMA_MAX_INLINE_SGE 4
40 #define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE)
41
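/*
 * Per-RECV-buffer context: one SGE for the NVMe command capsule plus up to
 * NVMET_RDMA_MAX_INLINE_SGE SGEs for in-capsule (inline) write data.
 */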
42 struct nvmet_rdma_cmd {
43 struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
44 struct ib_cqe cqe;
45 struct ib_recv_wr wr;
46 struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
47 struct nvme_command *nvme_cmd;
48 struct nvmet_rdma_queue *queue;
49 };
50
51 enum {
52 NVMET_RDMA_REQ_INLINE_DATA = (1 << 0),
53 NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1),
54 };
55
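/*
 * Per-response context: carries the SEND for the NVMe completion and the
 * rdma_rw context used for RDMA READ/WRITE of the command's data.
 */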
56 struct nvmet_rdma_rsp {
57 struct ib_sge send_sge;
58 struct ib_cqe send_cqe;
59 struct ib_send_wr send_wr;
60
61 struct nvmet_rdma_cmd *cmd;
62 struct nvmet_rdma_queue *queue;
63
64 struct ib_cqe read_cqe;
65 struct rdma_rw_ctx rw;
66
67 struct nvmet_req req;
68
69 bool allocated;
70 u8 n_rdma;
71 u32 flags;
72 u32 invalidate_rkey;
73
74 struct list_head wait_list;
75 struct list_head free_list;
76 };
77
78 enum nvmet_rdma_queue_state {
79 NVMET_RDMA_Q_CONNECTING,
80 NVMET_RDMA_Q_LIVE,
81 NVMET_RDMA_Q_DISCONNECTING,
82 };
83
84 struct nvmet_rdma_queue {
85 struct rdma_cm_id *cm_id;
86 struct nvmet_port *port;
87 struct ib_cq *cq;
88 atomic_t sq_wr_avail;
89 struct nvmet_rdma_device *dev;
90 spinlock_t state_lock;
91 enum nvmet_rdma_queue_state state;
92 struct nvmet_cq nvme_cq;
93 struct nvmet_sq nvme_sq;
94
95 struct nvmet_rdma_rsp *rsps;
96 struct list_head free_rsps;
97 spinlock_t rsps_lock;
98 struct nvmet_rdma_cmd *cmds;
99
100 struct work_struct release_work;
101 struct list_head rsp_wait_list;
102 struct list_head rsp_wr_wait_list;
103 spinlock_t rsp_wr_wait_lock;
104
105 int idx;
106 int host_qid;
107 int recv_queue_size;
108 int send_queue_size;
109
110 struct list_head queue_list;
111 };
112
113 struct nvmet_rdma_device {
114 struct ib_device *device;
115 struct ib_pd *pd;
116 struct ib_srq *srq;
117 struct nvmet_rdma_cmd *srq_cmds;
118 size_t srq_size;
119 struct kref ref;
120 struct list_head entry;
121 int inline_data_size;
122 int inline_page_count;
123 };
124
125 static struct workqueue_struct *nvmet_rdma_delete_wq;
126 static bool nvmet_rdma_use_srq;
127 module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
128 MODULE_PARM_DESC(use_srq, "Use shared receive queue.");
129
130 static DEFINE_IDA(nvmet_rdma_queue_ida);
131 static LIST_HEAD(nvmet_rdma_queue_list);
132 static DEFINE_MUTEX(nvmet_rdma_queue_mutex);
133
134 static LIST_HEAD(device_list);
135 static DEFINE_MUTEX(device_list_mutex);
136
137 static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
138 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
139 static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
140 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
141 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
142 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
143
144 static const struct nvmet_fabrics_ops nvmet_rdma_ops;
145
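/* Number of pages needed to cover @len bytes of inline data. */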
146 static int num_pages(int len)
147 {
148 return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
149 }
150
151 /* XXX: really should move to a generic header sooner or later.. */
152 static inline u32 get_unaligned_le24(const u8 *p)
153 {
154 return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
155 }
156
157 static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
158 {
159 return nvme_is_write(rsp->req.cmd) &&
160 rsp->req.transfer_len &&
161 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
162 }
163
164 static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
165 {
166 return !nvme_is_write(rsp->req.cmd) &&
167 rsp->req.transfer_len &&
168 !rsp->req.rsp->status &&
169 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
170 }
171
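/*
 * Grab a response context from the queue's pre-allocated free list; under
 * heavy load the list may be empty, in which case we fall back to a dynamic
 * allocation and mark it so nvmet_rdma_put_rsp() knows to kfree() it.
 */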
172 static inline struct nvmet_rdma_rsp *
173 nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
174 {
175 struct nvmet_rdma_rsp *rsp;
176 unsigned long flags;
177
178 spin_lock_irqsave(&queue->rsps_lock, flags);
179 rsp = list_first_entry_or_null(&queue->free_rsps,
180 struct nvmet_rdma_rsp, free_list);
181 if (likely(rsp))
182 list_del(&rsp->free_list);
183 spin_unlock_irqrestore(&queue->rsps_lock, flags);
184
185 if (unlikely(!rsp)) {
186 rsp = kmalloc(sizeof(*rsp), GFP_KERNEL);
187 if (unlikely(!rsp))
188 return NULL;
189 rsp->allocated = true;
190 }
191
192 return rsp;
193 }
194
195 static inline void
196 nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
197 {
198 unsigned long flags;
199
200 if (rsp->allocated) {
201 kfree(rsp);
202 return;
203 }
204
205 spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
206 list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
207 spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
208 }
209
210 static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
211 struct nvmet_rdma_cmd *c)
212 {
213 struct scatterlist *sg;
214 struct ib_sge *sge;
215 int i;
216
217 if (!ndev->inline_data_size)
218 return;
219
220 sg = c->inline_sg;
221 sge = &c->sge[1];
222
223 for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
224 if (sge->length)
225 ib_dma_unmap_page(ndev->device, sge->addr,
226 sge->length, DMA_FROM_DEVICE);
227 if (sg_page(sg))
228 __free_page(sg_page(sg));
229 }
230 }
231
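/*
 * Allocate and DMA-map the per-command pages used to receive in-capsule
 * (inline) write data; a zero inline_data_size means inline data is disabled.
 */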
232 static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
233 struct nvmet_rdma_cmd *c)
234 {
235 struct scatterlist *sg;
236 struct ib_sge *sge;
237 struct page *pg;
238 int len;
239 int i;
240
241 if (!ndev->inline_data_size)
242 return 0;
243
244 sg = c->inline_sg;
245 sg_init_table(sg, ndev->inline_page_count);
246 sge = &c->sge[1];
247 len = ndev->inline_data_size;
248
249 for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
250 pg = alloc_page(GFP_KERNEL);
251 if (!pg)
252 goto out_err;
253 sg_assign_page(sg, pg);
254 sge->addr = ib_dma_map_page(ndev->device,
255 pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
256 if (ib_dma_mapping_error(ndev->device, sge->addr))
257 goto out_err;
258 sge->length = min_t(int, len, PAGE_SIZE);
259 sge->lkey = ndev->pd->local_dma_lkey;
260 len -= sge->length;
261 }
262
263 return 0;
264 out_err:
265 for (; i >= 0; i--, sg--, sge--) {
266 if (sge->length)
267 ib_dma_unmap_page(ndev->device, sge->addr,
268 sge->length, DMA_FROM_DEVICE);
269 if (sg_page(sg))
270 __free_page(sg_page(sg));
271 }
272 return -ENOMEM;
273 }
274
275 static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
276 struct nvmet_rdma_cmd *c, bool admin)
277 {
278 /* NVMe command / RDMA RECV */
279 c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
280 if (!c->nvme_cmd)
281 goto out;
282
283 c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
284 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
285 if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
286 goto out_free_cmd;
287
288 c->sge[0].length = sizeof(*c->nvme_cmd);
289 c->sge[0].lkey = ndev->pd->local_dma_lkey;
290
291 if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
292 goto out_unmap_cmd;
293
294 c->cqe.done = nvmet_rdma_recv_done;
295
296 c->wr.wr_cqe = &c->cqe;
297 c->wr.sg_list = c->sge;
298 c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;
299
300 return 0;
301
302 out_unmap_cmd:
303 ib_dma_unmap_single(ndev->device, c->sge[0].addr,
304 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
305 out_free_cmd:
306 kfree(c->nvme_cmd);
307
308 out:
309 return -ENOMEM;
310 }
311
312 static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
313 struct nvmet_rdma_cmd *c, bool admin)
314 {
315 if (!admin)
316 nvmet_rdma_free_inline_pages(ndev, c);
317 ib_dma_unmap_single(ndev->device, c->sge[0].addr,
318 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
319 kfree(c->nvme_cmd);
320 }
321
322 static struct nvmet_rdma_cmd *
323 nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
324 int nr_cmds, bool admin)
325 {
326 struct nvmet_rdma_cmd *cmds;
327 int ret = -EINVAL, i;
328
329 cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
330 if (!cmds)
331 goto out;
332
333 for (i = 0; i < nr_cmds; i++) {
334 ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
335 if (ret)
336 goto out_free;
337 }
338
339 return cmds;
340
341 out_free:
342 while (--i >= 0)
343 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
344 kfree(cmds);
345 out:
346 return ERR_PTR(ret);
347 }
348
349 static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
350 struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
351 {
352 int i;
353
354 for (i = 0; i < nr_cmds; i++)
355 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
356 kfree(cmds);
357 }
358
359 static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
360 struct nvmet_rdma_rsp *r)
361 {
362 /* NVMe CQE / RDMA SEND */
363 r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
364 if (!r->req.rsp)
365 goto out;
366
367 r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
368 sizeof(*r->req.rsp), DMA_TO_DEVICE);
369 if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
370 goto out_free_rsp;
371
372 r->send_sge.length = sizeof(*r->req.rsp);
373 r->send_sge.lkey = ndev->pd->local_dma_lkey;
374
375 r->send_cqe.done = nvmet_rdma_send_done;
376
377 r->send_wr.wr_cqe = &r->send_cqe;
378 r->send_wr.sg_list = &r->send_sge;
379 r->send_wr.num_sge = 1;
380 r->send_wr.send_flags = IB_SEND_SIGNALED;
381
382 /* Data In / RDMA READ */
383 r->read_cqe.done = nvmet_rdma_read_data_done;
384 return 0;
385
386 out_free_rsp:
387 kfree(r->req.rsp);
388 out:
389 return -ENOMEM;
390 }
391
392 static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
393 struct nvmet_rdma_rsp *r)
394 {
395 ib_dma_unmap_single(ndev->device, r->send_sge.addr,
396 sizeof(*r->req.rsp), DMA_TO_DEVICE);
397 kfree(r->req.rsp);
398 }
399
400 static int
401 nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
402 {
403 struct nvmet_rdma_device *ndev = queue->dev;
404 int nr_rsps = queue->recv_queue_size * 2;
405 int ret = -EINVAL, i;
406
407 queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
408 GFP_KERNEL);
409 if (!queue->rsps)
410 goto out;
411
412 for (i = 0; i < nr_rsps; i++) {
413 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
414
415 ret = nvmet_rdma_alloc_rsp(ndev, rsp);
416 if (ret)
417 goto out_free;
418
419 list_add_tail(&rsp->free_list, &queue->free_rsps);
420 }
421
422 return 0;
423
424 out_free:
425 while (--i >= 0) {
426 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
427
428 list_del(&rsp->free_list);
429 nvmet_rdma_free_rsp(ndev, rsp);
430 }
431 kfree(queue->rsps);
432 out:
433 return ret;
434 }
435
436 static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
437 {
438 struct nvmet_rdma_device *ndev = queue->dev;
439 int i, nr_rsps = queue->recv_queue_size * 2;
440
441 for (i = 0; i < nr_rsps; i++) {
442 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
443
444 list_del(&rsp->free_list);
445 nvmet_rdma_free_rsp(ndev, rsp);
446 }
447 kfree(queue->rsps);
448 }
449
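/* (Re-)post the command buffer on the SRQ, or on the queue's own RQ. */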
450 static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
451 struct nvmet_rdma_cmd *cmd)
452 {
453 int ret;
454
455 ib_dma_sync_single_for_device(ndev->device,
456 cmd->sge[0].addr, cmd->sge[0].length,
457 DMA_FROM_DEVICE);
458
459 if (ndev->srq)
460 ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
461 else
462 ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);
463
464 if (unlikely(ret))
465 pr_err("post_recv cmd failed\n");
466
467 return ret;
468 }
469
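/*
 * Retry commands that were deferred because the send queue had no free work
 * requests; stop at the first one that still cannot get enough WR credits so
 * that command ordering is preserved.
 */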
470 static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
471 {
472 spin_lock(&queue->rsp_wr_wait_lock);
473 while (!list_empty(&queue->rsp_wr_wait_list)) {
474 struct nvmet_rdma_rsp *rsp;
475 bool ret;
476
477 rsp = list_entry(queue->rsp_wr_wait_list.next,
478 struct nvmet_rdma_rsp, wait_list);
479 list_del(&rsp->wait_list);
480
481 spin_unlock(&queue->rsp_wr_wait_lock);
482 ret = nvmet_rdma_execute_command(rsp);
483 spin_lock(&queue->rsp_wr_wait_lock);
484
485 if (!ret) {
486 list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
487 break;
488 }
489 }
490 spin_unlock(&queue->rsp_wr_wait_lock);
491 }
492
493
494 static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
495 {
496 struct nvmet_rdma_queue *queue = rsp->queue;
497
498 atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
499
500 if (rsp->n_rdma) {
501 rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
502 queue->cm_id->port_num, rsp->req.sg,
503 rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
504 }
505
506 if (rsp->req.sg != rsp->cmd->inline_sg)
507 sgl_free(rsp->req.sg);
508
509 if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
510 nvmet_rdma_process_wr_wait_list(queue);
511
512 nvmet_rdma_put_rsp(rsp);
513 }
514
515 static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
516 {
517 if (queue->nvme_sq.ctrl) {
518 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
519 } else {
520 /*
521 * We didn't set up the controller yet in case of
522 * an admin connect error; just disconnect and
523 * clean up the queue.
524 */
525 nvmet_rdma_queue_disconnect(queue);
526 }
527 }
528
529 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
530 {
531 struct nvmet_rdma_rsp *rsp =
532 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
533 struct nvmet_rdma_queue *queue = rsp->queue; /* rsp may be freed below */
534
535 nvmet_rdma_release_rsp(rsp);
536 if (unlikely(wc->status != IB_WC_SUCCESS &&
537 wc->status != IB_WC_WR_FLUSH_ERR)) {
538 pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
539 wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
540 nvmet_rdma_error_comp(queue);
541 }
542 }
543
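/*
 * Fabrics ->queue_response: post the SEND carrying the NVMe completion,
 * chaining any RDMA WRITE WRs for Data-Out in front of it, and use
 * SEND_WITH_INV when the host asked for remote invalidation of its rkey.
 */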
544 static void nvmet_rdma_queue_response(struct nvmet_req *req)
545 {
546 struct nvmet_rdma_rsp *rsp =
547 container_of(req, struct nvmet_rdma_rsp, req);
548 struct rdma_cm_id *cm_id = rsp->queue->cm_id;
549 struct ib_send_wr *first_wr;
550
551 if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
552 rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
553 rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
554 } else {
555 rsp->send_wr.opcode = IB_WR_SEND;
556 }
557
558 if (nvmet_rdma_need_data_out(rsp))
559 first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
560 cm_id->port_num, NULL, &rsp->send_wr);
561 else
562 first_wr = &rsp->send_wr;
563
564 nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);
565
566 ib_dma_sync_single_for_device(rsp->queue->dev->device,
567 rsp->send_sge.addr, rsp->send_sge.length,
568 DMA_TO_DEVICE);
569
570 if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
571 pr_err("sending cmd response failed\n");
572 nvmet_rdma_release_rsp(rsp);
573 }
574 }
575
576 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
577 {
578 struct nvmet_rdma_rsp *rsp =
579 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
580 struct nvmet_rdma_queue *queue = cq->cq_context;
581
582 WARN_ON(rsp->n_rdma <= 0);
583 atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
584 rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
585 queue->cm_id->port_num, rsp->req.sg,
586 rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
587 rsp->n_rdma = 0;
588
589 if (unlikely(wc->status != IB_WC_SUCCESS)) {
590 nvmet_req_uninit(&rsp->req);
591 nvmet_rdma_release_rsp(rsp);
592 if (wc->status != IB_WC_WR_FLUSH_ERR) {
593 pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
594 wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
595 nvmet_rdma_error_comp(queue);
596 }
597 return;
598 }
599
600 nvmet_req_execute(&rsp->req);
601 }
602
603 static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
604 u64 off)
605 {
606 int sg_count = num_pages(len);
607 struct scatterlist *sg;
608 int i;
609
610 sg = rsp->cmd->inline_sg;
611 for (i = 0; i < sg_count; i++, sg++) {
612 if (i < sg_count - 1)
613 sg_unmark_end(sg);
614 else
615 sg_mark_end(sg);
616 sg->offset = off;
617 sg->length = min_t(int, len, PAGE_SIZE - off);
618 len -= sg->length;
619 if (!i)
620 off = 0;
621 }
622
623 rsp->req.sg = rsp->cmd->inline_sg;
624 rsp->req.sg_cnt = sg_count;
625 }
626
627 static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
628 {
629 struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
630 u64 off = le64_to_cpu(sgl->addr);
631 u32 len = le32_to_cpu(sgl->length);
632
633 if (!nvme_is_write(rsp->req.cmd))
634 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
635
636 if (off + len > rsp->queue->dev->inline_data_size) {
637 pr_err("invalid inline data offset!\n");
638 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
639 }
640
641 /* no data command? */
642 if (!len)
643 return 0;
644
645 nvmet_rdma_use_inline_sg(rsp, len, off);
646 rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
647 rsp->req.transfer_len += len;
648 return 0;
649 }
650
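/*
 * Map a keyed SGL that points at host memory: allocate a local scatterlist
 * and set up an rdma_rw context for the RDMA READ/WRITE; the number of work
 * requests it needs is accounted in rsp->n_rdma.
 */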
651 static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
652 struct nvme_keyed_sgl_desc *sgl, bool invalidate)
653 {
654 struct rdma_cm_id *cm_id = rsp->queue->cm_id;
655 u64 addr = le64_to_cpu(sgl->addr);
656 u32 len = get_unaligned_le24(sgl->length);
657 u32 key = get_unaligned_le32(sgl->key);
658 int ret;
659
660 /* no data command? */
661 if (!len)
662 return 0;
663
664 rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
665 if (!rsp->req.sg)
666 return NVME_SC_INTERNAL;
667
668 ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
669 rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
670 nvmet_data_dir(&rsp->req));
671 if (ret < 0)
672 return NVME_SC_INTERNAL;
673 rsp->req.transfer_len += len;
674 rsp->n_rdma += ret;
675
676 if (invalidate) {
677 rsp->invalidate_rkey = key;
678 rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
679 }
680
681 return 0;
682 }
683
684 static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
685 {
686 struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;
687
688 switch (sgl->type >> 4) {
689 case NVME_SGL_FMT_DATA_DESC:
690 switch (sgl->type & 0xf) {
691 case NVME_SGL_FMT_OFFSET:
692 return nvmet_rdma_map_sgl_inline(rsp);
693 default:
694 pr_err("invalid SGL subtype: %#x\n", sgl->type);
695 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
696 }
697 case NVME_KEY_SGL_FMT_DATA_DESC:
698 switch (sgl->type & 0xf) {
699 case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
700 return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
701 case NVME_SGL_FMT_ADDRESS:
702 return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
703 default:
704 pr_err("invalid SGL subtype: %#x\n", sgl->type);
705 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
706 }
707 default:
708 pr_err("invalid SGL type: %#x\n", sgl->type);
709 return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
710 }
711 }
712
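/*
 * Reserve 1 + n_rdma send work requests for this command. Returns false if
 * the send queue is full so the caller can park the request on the
 * rsp_wr_wait_list. Writes with non-inline data post the RDMA READ first and
 * continue from nvmet_rdma_read_data_done(); everything else executes now.
 */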
713 static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
714 {
715 struct nvmet_rdma_queue *queue = rsp->queue;
716
717 if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
718 &queue->sq_wr_avail) < 0)) {
719 pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
720 1 + rsp->n_rdma, queue->idx,
721 queue->nvme_sq.ctrl->cntlid);
722 atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
723 return false;
724 }
725
726 if (nvmet_rdma_need_data_in(rsp)) {
727 if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
728 queue->cm_id->port_num, &rsp->read_cqe, NULL))
729 nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
730 } else {
731 nvmet_req_execute(&rsp->req);
732 }
733
734 return true;
735 }
736
737 static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
738 struct nvmet_rdma_rsp *cmd)
739 {
740 u16 status;
741
742 ib_dma_sync_single_for_cpu(queue->dev->device,
743 cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
744 DMA_FROM_DEVICE);
745 ib_dma_sync_single_for_cpu(queue->dev->device,
746 cmd->send_sge.addr, cmd->send_sge.length,
747 DMA_TO_DEVICE);
748
749 if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
750 &queue->nvme_sq, &nvmet_rdma_ops))
751 return;
752
753 status = nvmet_rdma_map_sgl(cmd);
754 if (status)
755 goto out_err;
756
757 if (unlikely(!nvmet_rdma_execute_command(cmd))) {
758 spin_lock(&queue->rsp_wr_wait_lock);
759 list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
760 spin_unlock(&queue->rsp_wr_wait_lock);
761 }
762
763 return;
764
765 out_err:
766 nvmet_req_complete(&cmd->req, status);
767 }
768
769 static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
770 {
771 struct nvmet_rdma_cmd *cmd =
772 container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
773 struct nvmet_rdma_queue *queue = cq->cq_context;
774 struct nvmet_rdma_rsp *rsp;
775
776 if (unlikely(wc->status != IB_WC_SUCCESS)) {
777 if (wc->status != IB_WC_WR_FLUSH_ERR) {
778 pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
779 wc->wr_cqe, ib_wc_status_msg(wc->status),
780 wc->status);
781 nvmet_rdma_error_comp(queue);
782 }
783 return;
784 }
785
786 if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
787 pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
788 nvmet_rdma_error_comp(queue);
789 return;
790 }
791
792 cmd->queue = queue;
793 rsp = nvmet_rdma_get_rsp(queue);
794 if (unlikely(!rsp)) {
795 /*
796 * We get here only under memory pressure:
797 * silently drop and have the host retry,
798 * as we can't even fail it.
799 */
800 nvmet_rdma_post_recv(queue->dev, cmd);
801 return;
802 }
803 rsp->queue = queue;
804 rsp->cmd = cmd;
805 rsp->flags = 0;
806 rsp->req.cmd = cmd->nvme_cmd;
807 rsp->req.port = queue->port;
808 rsp->n_rdma = 0;
809
810 if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
811 unsigned long flags;
812
813 spin_lock_irqsave(&queue->state_lock, flags);
814 if (queue->state == NVMET_RDMA_Q_CONNECTING)
815 list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
816 else
817 nvmet_rdma_put_rsp(rsp);
818 spin_unlock_irqrestore(&queue->state_lock, flags);
819 return;
820 }
821
822 nvmet_rdma_handle_command(queue, rsp);
823 }
824
825 static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
826 {
827 if (!ndev->srq)
828 return;
829
830 nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
831 ib_destroy_srq(ndev->srq);
832 }
833
834 static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
835 {
836 struct ib_srq_init_attr srq_attr = { NULL, };
837 struct ib_srq *srq;
838 size_t srq_size;
839 int ret, i;
840
841 srq_size = 4095; /* XXX: tune */
842
843 srq_attr.attr.max_wr = srq_size;
844 srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
845 srq_attr.attr.srq_limit = 0;
846 srq_attr.srq_type = IB_SRQT_BASIC;
847 srq = ib_create_srq(ndev->pd, &srq_attr);
848 if (IS_ERR(srq)) {
849 /*
850 * If SRQs aren't supported we just go ahead and use normal
851 * non-shared receive queues.
852 */
853 pr_info("SRQ requested but not supported.\n");
854 return 0;
855 }
856
857 ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
858 if (IS_ERR(ndev->srq_cmds)) {
859 ret = PTR_ERR(ndev->srq_cmds);
860 goto out_destroy_srq;
861 }
862
863 ndev->srq = srq;
864 ndev->srq_size = srq_size;
865
866 for (i = 0; i < srq_size; i++) {
867 ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
868 if (ret)
869 goto out_free_cmds;
870 }
871
872 return 0;
873
874 out_free_cmds:
875 nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
876 out_destroy_srq:
877 ib_destroy_srq(srq);
878 return ret;
879 }
880
881 static void nvmet_rdma_free_dev(struct kref *ref)
882 {
883 struct nvmet_rdma_device *ndev =
884 container_of(ref, struct nvmet_rdma_device, ref);
885
886 mutex_lock(&device_list_mutex);
887 list_del(&ndev->entry);
888 mutex_unlock(&device_list_mutex);
889
890 nvmet_rdma_destroy_srq(ndev);
891 ib_dealloc_pd(ndev->pd);
892
893 kfree(ndev);
894 }
895
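/*
 * Look up (or create) the per-ib_device state, keyed by node GUID. The
 * port's inline_data_size is clamped to what the device's recv SGE limits
 * can actually support.
 */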
896 static struct nvmet_rdma_device *
897 nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
898 {
899 struct nvmet_port *port = cm_id->context;
900 struct nvmet_rdma_device *ndev;
901 int inline_page_count;
902 int inline_sge_count;
903 int ret;
904
905 mutex_lock(&device_list_mutex);
906 list_for_each_entry(ndev, &device_list, entry) {
907 if (ndev->device->node_guid == cm_id->device->node_guid &&
908 kref_get_unless_zero(&ndev->ref))
909 goto out_unlock;
910 }
911
912 ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
913 if (!ndev)
914 goto out_err;
915
916 inline_page_count = num_pages(port->inline_data_size);
917 inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
918 cm_id->device->attrs.max_recv_sge) - 1;
919 if (inline_page_count > inline_sge_count) {
920 pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
921 port->inline_data_size, cm_id->device->name,
922 inline_sge_count * PAGE_SIZE);
923 port->inline_data_size = inline_sge_count * PAGE_SIZE;
924 inline_page_count = inline_sge_count;
925 }
926 ndev->inline_data_size = port->inline_data_size;
927 ndev->inline_page_count = inline_page_count;
928 ndev->device = cm_id->device;
929 kref_init(&ndev->ref);
930
931 ndev->pd = ib_alloc_pd(ndev->device, 0);
932 if (IS_ERR(ndev->pd))
933 goto out_free_dev;
934
935 if (nvmet_rdma_use_srq) {
936 ret = nvmet_rdma_init_srq(ndev);
937 if (ret)
938 goto out_free_pd;
939 }
940
941 list_add(&ndev->entry, &device_list);
942 out_unlock:
943 mutex_unlock(&device_list_mutex);
944 pr_debug("added %s.\n", ndev->device->name);
945 return ndev;
946
947 out_free_pd:
948 ib_dealloc_pd(ndev->pd);
949 out_free_dev:
950 kfree(ndev);
951 out_err:
952 mutex_unlock(&device_list_mutex);
953 return NULL;
954 }
955
956 static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
957 {
958 struct ib_qp_init_attr qp_attr;
959 struct nvmet_rdma_device *ndev = queue->dev;
960 int comp_vector, nr_cqe, ret, i;
961
962 /*
963 * Spread the io queues across completion vectors,
964 * but still keep all admin queues on vector 0.
965 */
966 comp_vector = !queue->host_qid ? 0 :
967 queue->idx % ndev->device->num_comp_vectors;
968
969 /*
970 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
971 */
972 nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
973
974 queue->cq = ib_alloc_cq(ndev->device, queue,
975 nr_cqe + 1, comp_vector,
976 IB_POLL_WORKQUEUE);
977 if (IS_ERR(queue->cq)) {
978 ret = PTR_ERR(queue->cq);
979 pr_err("failed to create CQ cqe= %d ret= %d\n",
980 nr_cqe + 1, ret);
981 goto out;
982 }
983
984 memset(&qp_attr, 0, sizeof(qp_attr));
985 qp_attr.qp_context = queue;
986 qp_attr.event_handler = nvmet_rdma_qp_event;
987 qp_attr.send_cq = queue->cq;
988 qp_attr.recv_cq = queue->cq;
989 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
990 qp_attr.qp_type = IB_QPT_RC;
991 /* +1 for drain */
992 qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
993 qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
994 qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
995 ndev->device->attrs.max_send_sge);
996
997 if (ndev->srq) {
998 qp_attr.srq = ndev->srq;
999 } else {
1000 /* +1 for drain */
1001 qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
1002 qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
1003 }
1004
1005 ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
1006 if (ret) {
1007 pr_err("failed to create_qp ret= %d\n", ret);
1008 goto err_destroy_cq;
1009 }
1010
1011 atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
1012
1013 pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
1014 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
1015 qp_attr.cap.max_send_wr, queue->cm_id);
1016
1017 if (!ndev->srq) {
1018 for (i = 0; i < queue->recv_queue_size; i++) {
1019 queue->cmds[i].queue = queue;
1020 ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
1021 if (ret)
1022 goto err_destroy_qp;
1023 }
1024 }
1025
1026 out:
1027 return ret;
1028
1029 err_destroy_qp:
1030 rdma_destroy_qp(queue->cm_id);
1031 err_destroy_cq:
1032 ib_free_cq(queue->cq);
1033 goto out;
1034 }
1035
1036 static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
1037 {
1038 struct ib_qp *qp = queue->cm_id->qp;
1039
1040 ib_drain_qp(qp);
1041 rdma_destroy_id(queue->cm_id);
1042 ib_destroy_qp(qp);
1043 ib_free_cq(queue->cq);
1044 }
1045
1046 static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
1047 {
1048 pr_debug("freeing queue %d\n", queue->idx);
1049
1050 nvmet_sq_destroy(&queue->nvme_sq);
1051
1052 nvmet_rdma_destroy_queue_ib(queue);
1053 if (!queue->dev->srq) {
1054 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1055 queue->recv_queue_size,
1056 !queue->host_qid);
1057 }
1058 nvmet_rdma_free_rsps(queue);
1059 ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1060 kfree(queue);
1061 }
1062
1063 static void nvmet_rdma_release_queue_work(struct work_struct *w)
1064 {
1065 struct nvmet_rdma_queue *queue =
1066 container_of(w, struct nvmet_rdma_queue, release_work);
1067 struct nvmet_rdma_device *dev = queue->dev;
1068
1069 nvmet_rdma_free_queue(queue);
1070
1071 kref_put(&dev->ref, nvmet_rdma_free_dev);
1072 }
1073
1074 static int
1075 nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
1076 struct nvmet_rdma_queue *queue)
1077 {
1078 struct nvme_rdma_cm_req *req;
1079
1080 req = (struct nvme_rdma_cm_req *)conn->private_data;
1081 if (!req || conn->private_data_len == 0)
1082 return NVME_RDMA_CM_INVALID_LEN;
1083
1084 if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
1085 return NVME_RDMA_CM_INVALID_RECFMT;
1086
1087 queue->host_qid = le16_to_cpu(req->qid);
1088
1089 /*
1090 * req->hsqsize is a 0's based value, hence the +1 when sizing our recv
1091 * queue; req->hrqsize corresponds to our send queue size.
1092 */
1093 queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
1094 queue->send_queue_size = le16_to_cpu(req->hrqsize);
1095
1096 if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
1097 return NVME_RDMA_CM_INVALID_HSQSIZE;
1098
1099 /* XXX: Should we enforce some kind of max for IO queues? */
1100
1101 return 0;
1102 }
1103
1104 static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
1105 enum nvme_rdma_cm_status status)
1106 {
1107 struct nvme_rdma_cm_rej rej;
1108
1109 pr_debug("rejecting connect request: status %d (%s)\n",
1110 status, nvme_rdma_cm_msg(status));
1111
1112 rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1113 rej.sts = cpu_to_le16(status);
1114
1115 return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
1116 }
1117
1118 static struct nvmet_rdma_queue *
1119 nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
1120 struct rdma_cm_id *cm_id,
1121 struct rdma_cm_event *event)
1122 {
1123 struct nvmet_rdma_queue *queue;
1124 int ret;
1125
1126 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1127 if (!queue) {
1128 ret = NVME_RDMA_CM_NO_RSC;
1129 goto out_reject;
1130 }
1131
1132 ret = nvmet_sq_init(&queue->nvme_sq);
1133 if (ret) {
1134 ret = NVME_RDMA_CM_NO_RSC;
1135 goto out_free_queue;
1136 }
1137
1138 ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
1139 if (ret)
1140 goto out_destroy_sq;
1141
1142 /*
1143 * Schedules the actual release because calling rdma_destroy_id from
1144 * inside a CM callback would trigger a deadlock. (great API design..)
1145 */
1146 INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
1147 queue->dev = ndev;
1148 queue->cm_id = cm_id;
1149
1150 spin_lock_init(&queue->state_lock);
1151 queue->state = NVMET_RDMA_Q_CONNECTING;
1152 INIT_LIST_HEAD(&queue->rsp_wait_list);
1153 INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
1154 spin_lock_init(&queue->rsp_wr_wait_lock);
1155 INIT_LIST_HEAD(&queue->free_rsps);
1156 spin_lock_init(&queue->rsps_lock);
1157 INIT_LIST_HEAD(&queue->queue_list);
1158
1159 queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
1160 if (queue->idx < 0) {
1161 ret = NVME_RDMA_CM_NO_RSC;
1162 goto out_destroy_sq;
1163 }
1164
1165 ret = nvmet_rdma_alloc_rsps(queue);
1166 if (ret) {
1167 ret = NVME_RDMA_CM_NO_RSC;
1168 goto out_ida_remove;
1169 }
1170
1171 if (!ndev->srq) {
1172 queue->cmds = nvmet_rdma_alloc_cmds(ndev,
1173 queue->recv_queue_size,
1174 !queue->host_qid);
1175 if (IS_ERR(queue->cmds)) {
1176 ret = NVME_RDMA_CM_NO_RSC;
1177 goto out_free_responses;
1178 }
1179 }
1180
1181 ret = nvmet_rdma_create_queue_ib(queue);
1182 if (ret) {
1183 pr_err("%s: creating RDMA queue failed (%d).\n",
1184 __func__, ret);
1185 ret = NVME_RDMA_CM_NO_RSC;
1186 goto out_free_cmds;
1187 }
1188
1189 return queue;
1190
1191 out_free_cmds:
1192 if (!ndev->srq) {
1193 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1194 queue->recv_queue_size,
1195 !queue->host_qid);
1196 }
1197 out_free_responses:
1198 nvmet_rdma_free_rsps(queue);
1199 out_ida_remove:
1200 ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1201 out_destroy_sq:
1202 nvmet_sq_destroy(&queue->nvme_sq);
1203 out_free_queue:
1204 kfree(queue);
1205 out_reject:
1206 nvmet_rdma_cm_reject(cm_id, ret);
1207 return NULL;
1208 }
1209
1210 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
1211 {
1212 struct nvmet_rdma_queue *queue = priv;
1213
1214 switch (event->event) {
1215 case IB_EVENT_COMM_EST:
1216 rdma_notify(queue->cm_id, event->event);
1217 break;
1218 default:
1219 pr_err("received IB QP event: %s (%d)\n",
1220 ib_event_msg(event->event), event->event);
1221 break;
1222 }
1223 }
1224
1225 static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
1226 struct nvmet_rdma_queue *queue,
1227 struct rdma_conn_param *p)
1228 {
1229 struct rdma_conn_param param = { };
1230 struct nvme_rdma_cm_rep priv = { };
1231 int ret = -ENOMEM;
1232
1233 param.rnr_retry_count = 7;
1234 param.flow_control = 1;
1235 param.initiator_depth = min_t(u8, p->initiator_depth,
1236 queue->dev->device->attrs.max_qp_init_rd_atom);
1237 param.private_data = &priv;
1238 param.private_data_len = sizeof(priv);
1239 priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1240 priv.crqsize = cpu_to_le16(queue->recv_queue_size);
1241
1242 ret = rdma_accept(cm_id, &param);
1243 if (ret)
1244 pr_err("rdma_accept failed (error code = %d)\n", ret);
1245
1246 return ret;
1247 }
1248
1249 static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
1250 struct rdma_cm_event *event)
1251 {
1252 struct nvmet_rdma_device *ndev;
1253 struct nvmet_rdma_queue *queue;
1254 int ret = -EINVAL;
1255
1256 ndev = nvmet_rdma_find_get_device(cm_id);
1257 if (!ndev) {
1258 nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
1259 return -ECONNREFUSED;
1260 }
1261
1262 queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
1263 if (!queue) {
1264 ret = -ENOMEM;
1265 goto put_device;
1266 }
1267 queue->port = cm_id->context;
1268
1269 if (queue->host_qid == 0) {
1270 /* Let inflight controller teardown complete */
1271 flush_workqueue(nvmet_rdma_delete_wq);
1272 }
1273
1274 ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
1275 if (ret) {
1276 queue_work(nvmet_rdma_delete_wq, &queue->release_work);
1277 /* Destroying rdma_cm id is not needed here */
1278 return 0;
1279 }
1280
1281 mutex_lock(&nvmet_rdma_queue_mutex);
1282 list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
1283 mutex_unlock(&nvmet_rdma_queue_mutex);
1284
1285 return 0;
1286
1287 put_device:
1288 kref_put(&ndev->ref, nvmet_rdma_free_dev);
1289
1290 return ret;
1291 }
1292
1293 static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
1294 {
1295 unsigned long flags;
1296
1297 spin_lock_irqsave(&queue->state_lock, flags);
1298 if (queue->state != NVMET_RDMA_Q_CONNECTING) {
1299 pr_warn("trying to establish a connected queue\n");
1300 goto out_unlock;
1301 }
1302 queue->state = NVMET_RDMA_Q_LIVE;
1303
1304 while (!list_empty(&queue->rsp_wait_list)) {
1305 struct nvmet_rdma_rsp *cmd;
1306
1307 cmd = list_first_entry(&queue->rsp_wait_list,
1308 struct nvmet_rdma_rsp, wait_list);
1309 list_del(&cmd->wait_list);
1310
1311 spin_unlock_irqrestore(&queue->state_lock, flags);
1312 nvmet_rdma_handle_command(queue, cmd);
1313 spin_lock_irqsave(&queue->state_lock, flags);
1314 }
1315
1316 out_unlock:
1317 spin_unlock_irqrestore(&queue->state_lock, flags);
1318 }
1319
1320 static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1321 {
1322 bool disconnect = false;
1323 unsigned long flags;
1324
1325 pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);
1326
1327 spin_lock_irqsave(&queue->state_lock, flags);
1328 switch (queue->state) {
1329 case NVMET_RDMA_Q_CONNECTING:
1330 case NVMET_RDMA_Q_LIVE:
1331 queue->state = NVMET_RDMA_Q_DISCONNECTING;
1332 disconnect = true;
1333 break;
1334 case NVMET_RDMA_Q_DISCONNECTING:
1335 break;
1336 }
1337 spin_unlock_irqrestore(&queue->state_lock, flags);
1338
1339 if (disconnect) {
1340 rdma_disconnect(queue->cm_id);
1341 queue_work(nvmet_rdma_delete_wq, &queue->release_work);
1342 }
1343 }
1344
1345 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1346 {
1347 bool disconnect = false;
1348
1349 mutex_lock(&nvmet_rdma_queue_mutex);
1350 if (!list_empty(&queue->queue_list)) {
1351 list_del_init(&queue->queue_list);
1352 disconnect = true;
1353 }
1354 mutex_unlock(&nvmet_rdma_queue_mutex);
1355
1356 if (disconnect)
1357 __nvmet_rdma_queue_disconnect(queue);
1358 }
1359
1360 static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
1361 struct nvmet_rdma_queue *queue)
1362 {
1363 WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);
1364
1365 mutex_lock(&nvmet_rdma_queue_mutex);
1366 if (!list_empty(&queue->queue_list))
1367 list_del_init(&queue->queue_list);
1368 mutex_unlock(&nvmet_rdma_queue_mutex);
1369
1370 pr_err("failed to connect queue %d\n", queue->idx);
1371 queue_work(nvmet_rdma_delete_wq, &queue->release_work);
1372 }
1373
1374 /**
1375 * nvmet_rdma_device_removal() - Handle RDMA device removal
1376 * @cm_id: rdma_cm id, used for nvmet port
1377 * @queue: nvmet rdma queue (cm id qp_context)
1378 *
1379 * DEVICE_REMOVAL event notifies us that the RDMA device is about
1380 * to be unplugged. Note that this event can be generated on a normal
1381 * queue cm_id and/or a device bound listener cm_id (in which case
1382 * queue will be null).
1383 *
1384 * We registered an ib_client to handle device removal for queues,
1385 * so we only need to handle the listening port cm_ids. In this case
1386 * we nullify the priv to prevent double cm_id destruction and destroy
1387 * the cm_id implicitly by returning a non-zero rc to the callout.
1388 */
1389 static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
1390 struct nvmet_rdma_queue *queue)
1391 {
1392 struct nvmet_port *port;
1393
1394 if (queue) {
1395 /*
1396 * This is a queue cm_id. We have registered
1397 * an ib_client to handle queue removal,
1398 * so don't interfere and just return.
1399 */
1400 return 0;
1401 }
1402
1403 port = cm_id->context;
1404
1405 /*
1406 * This is a listener cm_id. Make sure that a
1407 * future remove_port won't invoke a double
1408 * cm_id destroy. Use an atomic xchg to make sure
1409 * we don't compete with remove_port.
1410 */
1411 if (xchg(&port->priv, NULL) != cm_id)
1412 return 0;
1413
1414 /*
1415 * We need to return 1 so that the core will destroy
1416 * its own ID. What a great API design..
1417 */
1418 return 1;
1419 }
1420
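/*
 * Central rdma_cm event handler, shared by the listener cm_id (whose context
 * is the nvmet_port) and the per-queue cm_ids (whose qp_context is the queue).
 */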
1421 static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
1422 struct rdma_cm_event *event)
1423 {
1424 struct nvmet_rdma_queue *queue = NULL;
1425 int ret = 0;
1426
1427 if (cm_id->qp)
1428 queue = cm_id->qp->qp_context;
1429
1430 pr_debug("%s (%d): status %d id %p\n",
1431 rdma_event_msg(event->event), event->event,
1432 event->status, cm_id);
1433
1434 switch (event->event) {
1435 case RDMA_CM_EVENT_CONNECT_REQUEST:
1436 ret = nvmet_rdma_queue_connect(cm_id, event);
1437 break;
1438 case RDMA_CM_EVENT_ESTABLISHED:
1439 nvmet_rdma_queue_established(queue);
1440 break;
1441 case RDMA_CM_EVENT_ADDR_CHANGE:
1442 case RDMA_CM_EVENT_DISCONNECTED:
1443 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1444 nvmet_rdma_queue_disconnect(queue);
1445 break;
1446 case RDMA_CM_EVENT_DEVICE_REMOVAL:
1447 ret = nvmet_rdma_device_removal(cm_id, queue);
1448 break;
1449 case RDMA_CM_EVENT_REJECTED:
1450 pr_debug("Connection rejected: %s\n",
1451 rdma_reject_msg(cm_id, event->status));
1452 /* FALLTHROUGH */
1453 case RDMA_CM_EVENT_UNREACHABLE:
1454 case RDMA_CM_EVENT_CONNECT_ERROR:
1455 nvmet_rdma_queue_connect_fail(cm_id, queue);
1456 break;
1457 default:
1458 pr_err("received unrecognized RDMA CM event %d\n",
1459 event->event);
1460 break;
1461 }
1462
1463 return ret;
1464 }
1465
1466 static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
1467 {
1468 struct nvmet_rdma_queue *queue;
1469
1470 restart:
1471 mutex_lock(&nvmet_rdma_queue_mutex);
1472 list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
1473 if (queue->nvme_sq.ctrl == ctrl) {
1474 list_del_init(&queue->queue_list);
1475 mutex_unlock(&nvmet_rdma_queue_mutex);
1476
1477 __nvmet_rdma_queue_disconnect(queue);
1478 goto restart;
1479 }
1480 }
1481 mutex_unlock(&nvmet_rdma_queue_mutex);
1482 }
1483
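/*
 * Fabrics ->add_port: resolve traddr/trsvcid into a sockaddr and create an
 * rdma_cm listener for it; the cm_id is stashed in port->priv so that
 * remove_port (or device removal) can tear it down.
 */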
1484 static int nvmet_rdma_add_port(struct nvmet_port *port)
1485 {
1486 struct rdma_cm_id *cm_id;
1487 struct sockaddr_storage addr = { };
1488 __kernel_sa_family_t af;
1489 int ret;
1490
1491 switch (port->disc_addr.adrfam) {
1492 case NVMF_ADDR_FAMILY_IP4:
1493 af = AF_INET;
1494 break;
1495 case NVMF_ADDR_FAMILY_IP6:
1496 af = AF_INET6;
1497 break;
1498 default:
1499 pr_err("address family %d not supported\n",
1500 port->disc_addr.adrfam);
1501 return -EINVAL;
1502 }
1503
1504 if (port->inline_data_size < 0) {
1505 port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
1506 } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
1507 pr_warn("inline_data_size %u is too large, reducing to %u\n",
1508 port->inline_data_size,
1509 NVMET_RDMA_MAX_INLINE_DATA_SIZE);
1510 port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
1511 }
1512
1513 ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
1514 port->disc_addr.trsvcid, &addr);
1515 if (ret) {
1516 pr_err("malformed ip/port passed: %s:%s\n",
1517 port->disc_addr.traddr, port->disc_addr.trsvcid);
1518 return ret;
1519 }
1520
1521 cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
1522 RDMA_PS_TCP, IB_QPT_RC);
1523 if (IS_ERR(cm_id)) {
1524 pr_err("CM ID creation failed\n");
1525 return PTR_ERR(cm_id);
1526 }
1527
1528 /*
1529 * Allow both IPv4 and IPv6 sockets to bind a single port
1530 * at the same time.
1531 */
1532 ret = rdma_set_afonly(cm_id, 1);
1533 if (ret) {
1534 pr_err("rdma_set_afonly failed (%d)\n", ret);
1535 goto out_destroy_id;
1536 }
1537
1538 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
1539 if (ret) {
1540 pr_err("binding CM ID to %pISpcs failed (%d)\n",
1541 (struct sockaddr *)&addr, ret);
1542 goto out_destroy_id;
1543 }
1544
1545 ret = rdma_listen(cm_id, 128);
1546 if (ret) {
1547 pr_err("listening to %pISpcs failed (%d)\n",
1548 (struct sockaddr *)&addr, ret);
1549 goto out_destroy_id;
1550 }
1551
1552 pr_info("enabling port %d (%pISpcs)\n",
1553 le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
1554 port->priv = cm_id;
1555 return 0;
1556
1557 out_destroy_id:
1558 rdma_destroy_id(cm_id);
1559 return ret;
1560 }
1561
1562 static void nvmet_rdma_remove_port(struct nvmet_port *port)
1563 {
1564 struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);
1565
1566 if (cm_id)
1567 rdma_destroy_id(cm_id);
1568 }
1569
1570 static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
1571 struct nvmet_port *port, char *traddr)
1572 {
1573 struct rdma_cm_id *cm_id = port->priv;
1574
1575 if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
1576 struct nvmet_rdma_rsp *rsp =
1577 container_of(req, struct nvmet_rdma_rsp, req);
1578 struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
1579 struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;
1580
1581 sprintf(traddr, "%pISc", addr);
1582 } else {
1583 memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
1584 }
1585 }
1586
1587 static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
1588 .owner = THIS_MODULE,
1589 .type = NVMF_TRTYPE_RDMA,
1590 .msdbd = 1,
1591 .has_keyed_sgls = 1,
1592 .add_port = nvmet_rdma_add_port,
1593 .remove_port = nvmet_rdma_remove_port,
1594 .queue_response = nvmet_rdma_queue_response,
1595 .delete_ctrl = nvmet_rdma_delete_ctrl,
1596 .disc_traddr = nvmet_rdma_disc_port_addr,
1597 };
1598
1599 static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
1600 {
1601 struct nvmet_rdma_queue *queue, *tmp;
1602 struct nvmet_rdma_device *ndev;
1603 bool found = false;
1604
1605 mutex_lock(&device_list_mutex);
1606 list_for_each_entry(ndev, &device_list, entry) {
1607 if (ndev->device == ib_device) {
1608 found = true;
1609 break;
1610 }
1611 }
1612 mutex_unlock(&device_list_mutex);
1613
1614 if (!found)
1615 return;
1616
1617 /*
1618 * IB Device that is used by nvmet controllers is being removed,
1619 * delete all queues using this device.
1620 */
1621 mutex_lock(&nvmet_rdma_queue_mutex);
1622 list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
1623 queue_list) {
1624 if (queue->dev->device != ib_device)
1625 continue;
1626
1627 pr_info("Removing queue %d\n", queue->idx);
1628 list_del_init(&queue->queue_list);
1629 __nvmet_rdma_queue_disconnect(queue);
1630 }
1631 mutex_unlock(&nvmet_rdma_queue_mutex);
1632
1633 flush_scheduled_work();
1634 }
1635
1636 static struct ib_client nvmet_rdma_ib_client = {
1637 .name = "nvmet_rdma",
1638 .remove = nvmet_rdma_remove_one
1639 };
1640
1641 static int __init nvmet_rdma_init(void)
1642 {
1643 int ret;
1644
1645 ret = ib_register_client(&nvmet_rdma_ib_client);
1646 if (ret)
1647 return ret;
1648
1649 ret = nvmet_register_transport(&nvmet_rdma_ops);
1650 if (ret)
1651 goto err_ib_client;
1652
1653 nvmet_rdma_delete_wq = alloc_workqueue("nvmet-rdma-delete-wq",
1654 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
1655 if (!nvmet_rdma_delete_wq) {
1656 ret = -ENOMEM;
1657 goto err_unreg_transport;
1658 }
1659
1660 return 0;
1661
1662 err_unreg_transport:
1663 nvmet_unregister_transport(&nvmet_rdma_ops);
1664 err_ib_client:
1665 ib_unregister_client(&nvmet_rdma_ib_client);
1666 return ret;
1667 }
1668
1669 static void __exit nvmet_rdma_exit(void)
1670 {
1671 destroy_workqueue(nvmet_rdma_delete_wq);
1672 nvmet_unregister_transport(&nvmet_rdma_ops);
1673 ib_unregister_client(&nvmet_rdma_ib_client);
1674 WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
1675 ida_destroy(&nvmet_rdma_queue_ida);
1676 }
1677
1678 module_init(nvmet_rdma_init);
1679 module_exit(nvmet_rdma_exit);
1680
1681 MODULE_LICENSE("GPL v2");
1682 MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */