drivers/nvme/target/rdma.c
1 /*
2 * NVMe over Fabrics RDMA target.
3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15 #include <linux/atomic.h>
16 #include <linux/ctype.h>
17 #include <linux/delay.h>
18 #include <linux/err.h>
19 #include <linux/init.h>
20 #include <linux/module.h>
21 #include <linux/nvme.h>
22 #include <linux/slab.h>
23 #include <linux/string.h>
24 #include <linux/wait.h>
25 #include <linux/inet.h>
26 #include <asm/unaligned.h>
27
28 #include <rdma/ib_verbs.h>
29 #include <rdma/rdma_cm.h>
30 #include <rdma/rw.h>
31
32 #include <linux/nvme-rdma.h>
33 #include "nvmet.h"
34
35 /*
36 * Inline data: one page by default, at most 4 SGEs and max(16KB, PAGE_SIZE).
37 */
38 #define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE PAGE_SIZE
39 #define NVMET_RDMA_MAX_INLINE_SGE 4
40 #define NVMET_RDMA_MAX_INLINE_DATA_SIZE max_t(int, SZ_16K, PAGE_SIZE)
41
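/*
 * Per-RECV-buffer context: one SGE for the NVMe command capsule plus up to
 * NVMET_RDMA_MAX_INLINE_SGE SGEs for in-capsule (inline) write data.
 */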
42 struct nvmet_rdma_cmd {
43 struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
44 struct ib_cqe cqe;
45 struct ib_recv_wr wr;
46 struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
47 struct nvme_command *nvme_cmd;
48 struct nvmet_rdma_queue *queue;
49 };
50
51 enum {
52 NVMET_RDMA_REQ_INLINE_DATA = (1 << 0),
53 NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1),
54 };
55
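/*
 * Per-response context: carries the SEND for the NVMe completion and the
 * rdma_rw context used for RDMA READ/WRITE of the command's data.
 */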
56 struct nvmet_rdma_rsp {
57 struct ib_sge send_sge;
58 struct ib_cqe send_cqe;
59 struct ib_send_wr send_wr;
60
61 struct nvmet_rdma_cmd *cmd;
62 struct nvmet_rdma_queue *queue;
63
64 struct ib_cqe read_cqe;
65 struct rdma_rw_ctx rw;
66
67 struct nvmet_req req;
68
69 bool allocated;
70 u8 n_rdma;
71 u32 flags;
72 u32 invalidate_rkey;
73
74 struct list_head wait_list;
75 struct list_head free_list;
76 };
77
78 enum nvmet_rdma_queue_state {
79 NVMET_RDMA_Q_CONNECTING,
80 NVMET_RDMA_Q_LIVE,
81 NVMET_RDMA_Q_DISCONNECTING,
82 };
83
84 struct nvmet_rdma_queue {
85 struct rdma_cm_id *cm_id;
86 struct nvmet_port *port;
87 struct ib_cq *cq;
88 atomic_t sq_wr_avail;
89 struct nvmet_rdma_device *dev;
90 spinlock_t state_lock;
91 enum nvmet_rdma_queue_state state;
92 struct nvmet_cq nvme_cq;
93 struct nvmet_sq nvme_sq;
94
95 struct nvmet_rdma_rsp *rsps;
96 struct list_head free_rsps;
97 spinlock_t rsps_lock;
98 struct nvmet_rdma_cmd *cmds;
99
100 struct work_struct release_work;
101 struct list_head rsp_wait_list;
102 struct list_head rsp_wr_wait_list;
103 spinlock_t rsp_wr_wait_lock;
104
105 int idx;
106 int host_qid;
107 int recv_queue_size;
108 int send_queue_size;
109
110 struct list_head queue_list;
111 };
112
113 struct nvmet_rdma_device {
114 struct ib_device *device;
115 struct ib_pd *pd;
116 struct ib_srq *srq;
117 struct nvmet_rdma_cmd *srq_cmds;
118 size_t srq_size;
119 struct kref ref;
120 struct list_head entry;
121 int inline_data_size;
122 int inline_page_count;
123 };
124
125 static struct workqueue_struct *nvmet_rdma_delete_wq;
126 static bool nvmet_rdma_use_srq;
127 module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
128 MODULE_PARM_DESC(use_srq, "Use shared receive queue.");
129
130 static DEFINE_IDA(nvmet_rdma_queue_ida);
131 static LIST_HEAD(nvmet_rdma_queue_list);
132 static DEFINE_MUTEX(nvmet_rdma_queue_mutex);
133
134 static LIST_HEAD(device_list);
135 static DEFINE_MUTEX(device_list_mutex);
136
137 static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
138 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
139 static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
140 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
141 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
142 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
143
144 static const struct nvmet_fabrics_ops nvmet_rdma_ops;
145
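/* Number of pages needed to cover @len bytes of inline data. */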
146 static int num_pages(int len)
147 {
148 return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
149 }
150
151 /* XXX: really should move to a generic header sooner or later.. */
152 static inline u32 get_unaligned_le24(const u8 *p)
153 {
154 return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
155 }
156
157 static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
158 {
159 return nvme_is_write(rsp->req.cmd) &&
160 rsp->req.transfer_len &&
161 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
162 }
163
164 static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
165 {
166 return !nvme_is_write(rsp->req.cmd) &&
167 rsp->req.transfer_len &&
168 !rsp->req.rsp->status &&
169 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
170 }
171
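/*
 * Grab a response context from the queue's pre-allocated free list; under
 * heavy load the list may be empty, in which case we fall back to a dynamic
 * allocation and mark it so nvmet_rdma_put_rsp() knows to kfree() it.
 */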
172 static inline struct nvmet_rdma_rsp *
173 nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
174 {
175 struct nvmet_rdma_rsp *rsp;
176 unsigned long flags;
177
178 spin_lock_irqsave(&queue->rsps_lock, flags);
179 rsp = list_first_entry_or_null(&queue->free_rsps,
180 struct nvmet_rdma_rsp, free_list);
181 if (likely(rsp))
182 list_del(&rsp->free_list);
183 spin_unlock_irqrestore(&queue->rsps_lock, flags);
184
185 if (unlikely(!rsp)) {
186 rsp = kmalloc(sizeof(*rsp), GFP_KERNEL);
187 if (unlikely(!rsp))
188 return NULL;
189 rsp->allocated = true;
190 }
191
192 return rsp;
193 }
194
195 static inline void
196 nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
197 {
198 unsigned long flags;
199
200 if (rsp->allocated) {
201 kfree(rsp);
202 return;
203 }
204
205 spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
206 list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
207 spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
208 }
209
210 static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
211 struct nvmet_rdma_cmd *c)
212 {
213 struct scatterlist *sg;
214 struct ib_sge *sge;
215 int i;
216
217 if (!ndev->inline_data_size)
218 return;
219
220 sg = c->inline_sg;
221 sge = &c->sge[1];
222
223 for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
224 if (sge->length)
225 ib_dma_unmap_page(ndev->device, sge->addr,
226 sge->length, DMA_FROM_DEVICE);
227 if (sg_page(sg))
228 __free_page(sg_page(sg));
229 }
230 }
231
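/*
 * Allocate and DMA-map the per-command pages used to receive in-capsule
 * (inline) write data; a zero inline_data_size means inline data is disabled.
 */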
232 static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
233 struct nvmet_rdma_cmd *c)
234 {
235 struct scatterlist *sg;
236 struct ib_sge *sge;
237 struct page *pg;
238 int len;
239 int i;
240
241 if (!ndev->inline_data_size)
242 return 0;
243
244 sg = c->inline_sg;
245 sg_init_table(sg, ndev->inline_page_count);
246 sge = &c->sge[1];
247 len = ndev->inline_data_size;
248
249 for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
250 pg = alloc_page(GFP_KERNEL);
251 if (!pg)
252 goto out_err;
253 sg_assign_page(sg, pg);
254 sge->addr = ib_dma_map_page(ndev->device,
255 pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
256 if (ib_dma_mapping_error(ndev->device, sge->addr))
257 goto out_err;
258 sge->length = min_t(int, len, PAGE_SIZE);
259 sge->lkey = ndev->pd->local_dma_lkey;
260 len -= sge->length;
261 }
262
263 return 0;
264 out_err:
265 for (; i >= 0; i--, sg--, sge--) {
266 if (sge->length)
267 ib_dma_unmap_page(ndev->device, sge->addr,
268 sge->length, DMA_FROM_DEVICE);
269 if (sg_page(sg))
270 __free_page(sg_page(sg));
271 }
272 return -ENOMEM;
273 }
274
275 static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
276 struct nvmet_rdma_cmd *c, bool admin)
277 {
278 /* NVMe command / RDMA RECV */
279 c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
280 if (!c->nvme_cmd)
281 goto out;
282
283 c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
284 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
285 if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
286 goto out_free_cmd;
287
288 c->sge[0].length = sizeof(*c->nvme_cmd);
289 c->sge[0].lkey = ndev->pd->local_dma_lkey;
290
291 if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
292 goto out_unmap_cmd;
293
294 c->cqe.done = nvmet_rdma_recv_done;
295
296 c->wr.wr_cqe = &c->cqe;
297 c->wr.sg_list = c->sge;
298 c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;
299
300 return 0;
301
302 out_unmap_cmd:
303 ib_dma_unmap_single(ndev->device, c->sge[0].addr,
304 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
305 out_free_cmd:
306 kfree(c->nvme_cmd);
307
308 out:
309 return -ENOMEM;
310 }
311
312 static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
313 struct nvmet_rdma_cmd *c, bool admin)
314 {
315 if (!admin)
316 nvmet_rdma_free_inline_pages(ndev, c);
317 ib_dma_unmap_single(ndev->device, c->sge[0].addr,
318 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
319 kfree(c->nvme_cmd);
320 }
321
322 static struct nvmet_rdma_cmd *
323 nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
324 int nr_cmds, bool admin)
325 {
326 struct nvmet_rdma_cmd *cmds;
327 int ret = -EINVAL, i;
328
329 cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
330 if (!cmds)
331 goto out;
332
333 for (i = 0; i < nr_cmds; i++) {
334 ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
335 if (ret)
336 goto out_free;
337 }
338
339 return cmds;
340
341 out_free:
342 while (--i >= 0)
343 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
344 kfree(cmds);
345 out:
346 return ERR_PTR(ret);
347 }
348
349 static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
350 struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
351 {
352 int i;
353
354 for (i = 0; i < nr_cmds; i++)
355 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
356 kfree(cmds);
357 }
358
359 static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
360 struct nvmet_rdma_rsp *r)
361 {
362 /* NVMe CQE / RDMA SEND */
363 r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
364 if (!r->req.rsp)
365 goto out;
366
367 r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
368 sizeof(*r->req.rsp), DMA_TO_DEVICE);
369 if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
370 goto out_free_rsp;
371
372 r->send_sge.length = sizeof(*r->req.rsp);
373 r->send_sge.lkey = ndev->pd->local_dma_lkey;
374
375 r->send_cqe.done = nvmet_rdma_send_done;
376
377 r->send_wr.wr_cqe = &r->send_cqe;
378 r->send_wr.sg_list = &r->send_sge;
379 r->send_wr.num_sge = 1;
380 r->send_wr.send_flags = IB_SEND_SIGNALED;
381
382 /* Data In / RDMA READ */
383 r->read_cqe.done = nvmet_rdma_read_data_done;
384 return 0;
385
386 out_free_rsp:
387 kfree(r->req.rsp);
388 out:
389 return -ENOMEM;
390 }
391
392 static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
393 struct nvmet_rdma_rsp *r)
394 {
395 ib_dma_unmap_single(ndev->device, r->send_sge.addr,
396 sizeof(*r->req.rsp), DMA_TO_DEVICE);
397 kfree(r->req.rsp);
398 }
399
400 static int
401 nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
402 {
403 struct nvmet_rdma_device *ndev = queue->dev;
404 int nr_rsps = queue->recv_queue_size * 2;
405 int ret = -EINVAL, i;
406
407 queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
408 GFP_KERNEL);
409 if (!queue->rsps)
410 goto out;
411
412 for (i = 0; i < nr_rsps; i++) {
413 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
414
415 ret = nvmet_rdma_alloc_rsp(ndev, rsp);
416 if (ret)
417 goto out_free;
418
419 list_add_tail(&rsp->free_list, &queue->free_rsps);
420 }
421
422 return 0;
423
424 out_free:
425 while (--i >= 0) {
426 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
427
428 list_del(&rsp->free_list);
429 nvmet_rdma_free_rsp(ndev, rsp);
430 }
431 kfree(queue->rsps);
432 out:
433 return ret;
434 }
435
436 static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
437 {
438 struct nvmet_rdma_device *ndev = queue->dev;
439 int i, nr_rsps = queue->recv_queue_size * 2;
440
441 for (i = 0; i < nr_rsps; i++) {
442 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
443
444 list_del(&rsp->free_list);
445 nvmet_rdma_free_rsp(ndev, rsp);
446 }
447 kfree(queue->rsps);
448 }
449
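/* (Re-)post the command buffer on the SRQ, or on the queue's own RQ. */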
450 static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
451 struct nvmet_rdma_cmd *cmd)
452 {
453 int ret;
454
455 ib_dma_sync_single_for_device(ndev->device,
456 cmd->sge[0].addr, cmd->sge[0].length,
457 DMA_FROM_DEVICE);
458
459 if (ndev->srq)
460 ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
461 else
462 ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);
463
464 if (unlikely(ret))
465 pr_err("post_recv cmd failed\n");
466
467 return ret;
468 }
469
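/*
 * Retry commands that were deferred because the send queue had no free work
 * requests; stop at the first one that still cannot get enough WR credits so
 * that command ordering is preserved.
 */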
470 static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
471 {
472 spin_lock(&queue->rsp_wr_wait_lock);
473 while (!list_empty(&queue->rsp_wr_wait_list)) {
474 struct nvmet_rdma_rsp *rsp;
475 bool ret;
476
477 rsp = list_entry(queue->rsp_wr_wait_list.next,
478 struct nvmet_rdma_rsp, wait_list);
479 list_del(&rsp->wait_list);
480
481 spin_unlock(&queue->rsp_wr_wait_lock);
482 ret = nvmet_rdma_execute_command(rsp);
483 spin_lock(&queue->rsp_wr_wait_lock);
484
485 if (!ret) {
486 list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
487 break;
488 }
489 }
490 spin_unlock(&queue->rsp_wr_wait_lock);
491 }
492
493
494 static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
495 {
496 struct nvmet_rdma_queue *queue = rsp->queue;
497
498 atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
499
500 if (rsp->n_rdma) {
501 rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
502 queue->cm_id->port_num, rsp->req.sg,
503 rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
504 }
505
506 if (rsp->req.sg != rsp->cmd->inline_sg)
507 sgl_free(rsp->req.sg);
508
509 if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
510 nvmet_rdma_process_wr_wait_list(queue);
511
512 nvmet_rdma_put_rsp(rsp);
513 }
514
515 static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
516 {
517 if (queue->nvme_sq.ctrl) {
518 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
519 } else {
520 /*
521 * We didn't set up the controller yet in case of
522 * an admin connect error; just disconnect and
523 * clean up the queue.
524 */
525 nvmet_rdma_queue_disconnect(queue);
526 }
527 }
528
529 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
530 {
531 struct nvmet_rdma_rsp *rsp =
532 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
533 struct nvmet_rdma_queue *queue = rsp->queue; /* rsp may be freed below */
534
535 nvmet_rdma_release_rsp(rsp);
536 if (unlikely(wc->status != IB_WC_SUCCESS &&
537 wc->status != IB_WC_WR_FLUSH_ERR)) {
538 pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
539 wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
540 nvmet_rdma_error_comp(queue);
541 }
542 }
543
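/*
 * Fabrics ->queue_response: post the SEND carrying the NVMe completion,
 * chaining any RDMA WRITE WRs for Data-Out in front of it, and use
 * SEND_WITH_INV when the host asked for remote invalidation of its rkey.
 */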
544 static void nvmet_rdma_queue_response(struct nvmet_req *req)
545 {
546 struct nvmet_rdma_rsp *rsp =
547 container_of(req, struct nvmet_rdma_rsp, req);
548 struct rdma_cm_id *cm_id = rsp->queue->cm_id;
549 struct ib_send_wr *first_wr;
550
551 if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
552 rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
553 rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
554 } else {
555 rsp->send_wr.opcode = IB_WR_SEND;
556 }
557
558 if (nvmet_rdma_need_data_out(rsp))
559 first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
560 cm_id->port_num, NULL, &rsp->send_wr);
561 else
562 first_wr = &rsp->send_wr;
563
564 nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);
565
566 ib_dma_sync_single_for_device(rsp->queue->dev->device,
567 rsp->send_sge.addr, rsp->send_sge.length,
568 DMA_TO_DEVICE);
569
570 if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
571 pr_err("sending cmd response failed\n");
572 nvmet_rdma_release_rsp(rsp);
573 }
574 }
575
576 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
577 {
578 struct nvmet_rdma_rsp *rsp =
579 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
580 struct nvmet_rdma_queue *queue = cq->cq_context;
581
582 WARN_ON(rsp->n_rdma <= 0);
583 atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
584 rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
585 queue->cm_id->port_num, rsp->req.sg,
586 rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
587 rsp->n_rdma = 0;
588
589 if (unlikely(wc->status != IB_WC_SUCCESS)) {
590 nvmet_req_uninit(&rsp->req);
591 nvmet_rdma_release_rsp(rsp);
592 if (wc->status != IB_WC_WR_FLUSH_ERR) {
593 pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
594 wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
595 nvmet_rdma_error_comp(queue);
596 }
597 return;
598 }
599
600 nvmet_req_execute(&rsp->req);
601 }
602
603 static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
604 u64 off)
605 {
606 int sg_count = num_pages(len);
607 struct scatterlist *sg;
608 int i;
609
610 sg = rsp->cmd->inline_sg;
611 for (i = 0; i < sg_count; i++, sg++) {
612 if (i < sg_count - 1)
613 sg_unmark_end(sg);
614 else
615 sg_mark_end(sg);
616 sg->offset = off;
617 sg->length = min_t(int, len, PAGE_SIZE - off);
618 len -= sg->length;
619 if (!i)
620 off = 0;
621 }
622
623 rsp->req.sg = rsp->cmd->inline_sg;
624 rsp->req.sg_cnt = sg_count;
625 }
626
627 static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
628 {
629 struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
630 u64 off = le64_to_cpu(sgl->addr);
631 u32 len = le32_to_cpu(sgl->length);
632
633 if (!nvme_is_write(rsp->req.cmd))
634 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
635
636 if (off + len > rsp->queue->dev->inline_data_size) {
637 pr_err("invalid inline data offset!\n");
638 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
639 }
640
641 /* no data command? */
642 if (!len)
643 return 0;
644
645 nvmet_rdma_use_inline_sg(rsp, len, off);
646 rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
647 rsp->req.transfer_len += len;
648 return 0;
649 }
650
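/*
 * Map a keyed SGL that points at host memory: allocate a local scatterlist
 * and set up an rdma_rw context for the RDMA READ/WRITE; the number of work
 * requests it needs is accounted in rsp->n_rdma.
 */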
651 static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
652 struct nvme_keyed_sgl_desc *sgl, bool invalidate)
653 {
654 struct rdma_cm_id *cm_id = rsp->queue->cm_id;
655 u64 addr = le64_to_cpu(sgl->addr);
656 u32 len = get_unaligned_le24(sgl->length);
657 u32 key = get_unaligned_le32(sgl->key);
658 int ret;
659
660 /* no data command? */
661 if (!len)
662 return 0;
663
664 rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
665 if (!rsp->req.sg)
666 return NVME_SC_INTERNAL;
667
668 ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
669 rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
670 nvmet_data_dir(&rsp->req));
671 if (ret < 0)
672 return NVME_SC_INTERNAL;
673 rsp->req.transfer_len += len;
674 rsp->n_rdma += ret;
675
676 if (invalidate) {
677 rsp->invalidate_rkey = key;
678 rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
679 }
680
681 return 0;
682 }
683
684 static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
685 {
686 struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;
687
688 switch (sgl->type >> 4) {
689 case NVME_SGL_FMT_DATA_DESC:
690 switch (sgl->type & 0xf) {
691 case NVME_SGL_FMT_OFFSET:
692 return nvmet_rdma_map_sgl_inline(rsp);
693 default:
694 pr_err("invalid SGL subtype: %#x\n", sgl->type);
695 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
696 }
697 case NVME_KEY_SGL_FMT_DATA_DESC:
698 switch (sgl->type & 0xf) {
699 case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
700 return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
701 case NVME_SGL_FMT_ADDRESS:
702 return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
703 default:
704 pr_err("invalid SGL subtype: %#x\n", sgl->type);
705 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
706 }
707 default:
708 pr_err("invalid SGL type: %#x\n", sgl->type);
709 return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
710 }
711 }
712
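/*
 * Reserve 1 + n_rdma send work requests for this command. Returns false if
 * the send queue is full so the caller can park the request on the
 * rsp_wr_wait_list. Writes with non-inline data post the RDMA READ first and
 * continue from nvmet_rdma_read_data_done(); everything else executes now.
 */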
713 static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
714 {
715 struct nvmet_rdma_queue *queue = rsp->queue;
716
717 if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
718 &queue->sq_wr_avail) < 0)) {
719 pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
720 1 + rsp->n_rdma, queue->idx,
721 queue->nvme_sq.ctrl->cntlid);
722 atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
723 return false;
724 }
725
726 if (nvmet_rdma_need_data_in(rsp)) {
727 if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
728 queue->cm_id->port_num, &rsp->read_cqe, NULL))
729 nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
730 } else {
731 nvmet_req_execute(&rsp->req);
732 }
733
734 return true;
735 }
736
737 static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
738 struct nvmet_rdma_rsp *cmd)
739 {
740 u16 status;
741
742 ib_dma_sync_single_for_cpu(queue->dev->device,
743 cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
744 DMA_FROM_DEVICE);
745 ib_dma_sync_single_for_cpu(queue->dev->device,
746 cmd->send_sge.addr, cmd->send_sge.length,
747 DMA_TO_DEVICE);
748
749 if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
750 &queue->nvme_sq, &nvmet_rdma_ops))
751 return;
752
753 status = nvmet_rdma_map_sgl(cmd);
754 if (status)
755 goto out_err;
756
757 if (unlikely(!nvmet_rdma_execute_command(cmd))) {
758 spin_lock(&queue->rsp_wr_wait_lock);
759 list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
760 spin_unlock(&queue->rsp_wr_wait_lock);
761 }
762
763 return;
764
765 out_err:
766 nvmet_req_complete(&cmd->req, status);
767 }
768
769 static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
770 {
771 struct nvmet_rdma_cmd *cmd =
772 container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
773 struct nvmet_rdma_queue *queue = cq->cq_context;
774 struct nvmet_rdma_rsp *rsp;
775
776 if (unlikely(wc->status != IB_WC_SUCCESS)) {
777 if (wc->status != IB_WC_WR_FLUSH_ERR) {
778 pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
779 wc->wr_cqe, ib_wc_status_msg(wc->status),
780 wc->status);
781 nvmet_rdma_error_comp(queue);
782 }
783 return;
784 }
785
786 if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
787 pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
788 nvmet_rdma_error_comp(queue);
789 return;
790 }
791
792 cmd->queue = queue;
793 rsp = nvmet_rdma_get_rsp(queue);
794 if (unlikely(!rsp)) {
795 /*
796 * We get here only under memory pressure:
797 * silently drop and have the host retry,
798 * as we can't even fail it.
799 */
800 nvmet_rdma_post_recv(queue->dev, cmd);
801 return;
802 }
803 rsp->queue = queue;
804 rsp->cmd = cmd;
805 rsp->flags = 0;
806 rsp->req.cmd = cmd->nvme_cmd;
807 rsp->req.port = queue->port;
808 rsp->n_rdma = 0;
809
810 if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
811 unsigned long flags;
812
813 spin_lock_irqsave(&queue->state_lock, flags);
814 if (queue->state == NVMET_RDMA_Q_CONNECTING)
815 list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
816 else
817 nvmet_rdma_put_rsp(rsp);
818 spin_unlock_irqrestore(&queue->state_lock, flags);
819 return;
820 }
821
822 nvmet_rdma_handle_command(queue, rsp);
823 }
824
825 static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
826 {
827 if (!ndev->srq)
828 return;
829
830 nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
831 ib_destroy_srq(ndev->srq);
832 }
833
834 static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
835 {
836 struct ib_srq_init_attr srq_attr = { NULL, };
837 struct ib_srq *srq;
838 size_t srq_size;
839 int ret, i;
840
841 srq_size = 4095; /* XXX: tune */
842
843 srq_attr.attr.max_wr = srq_size;
844 srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
845 srq_attr.attr.srq_limit = 0;
846 srq_attr.srq_type = IB_SRQT_BASIC;
847 srq = ib_create_srq(ndev->pd, &srq_attr);
848 if (IS_ERR(srq)) {
849 /*
850 * If SRQs aren't supported we just go ahead and use normal
851 * non-shared receive queues.
852 */
853 pr_info("SRQ requested but not supported.\n");
854 return 0;
855 }
856
857 ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
858 if (IS_ERR(ndev->srq_cmds)) {
859 ret = PTR_ERR(ndev->srq_cmds);
860 goto out_destroy_srq;
861 }
862
863 ndev->srq = srq;
864 ndev->srq_size = srq_size;
865
866 for (i = 0; i < srq_size; i++) {
867 ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
868 if (ret)
869 goto out_free_cmds;
870 }
871
872 return 0;
873
874 out_free_cmds:
875 nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
876 out_destroy_srq:
877 ib_destroy_srq(srq);
878 return ret;
879 }
880
881 static void nvmet_rdma_free_dev(struct kref *ref)
882 {
883 struct nvmet_rdma_device *ndev =
884 container_of(ref, struct nvmet_rdma_device, ref);
885
886 mutex_lock(&device_list_mutex);
887 list_del(&ndev->entry);
888 mutex_unlock(&device_list_mutex);
889
890 nvmet_rdma_destroy_srq(ndev);
891 ib_dealloc_pd(ndev->pd);
892
893 kfree(ndev);
894 }
895
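/*
 * Look up (or create) the per-ib_device state, keyed by node GUID. The
 * port's inline_data_size is clamped to what the device's recv SGE limits
 * can actually support.
 */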
896 static struct nvmet_rdma_device *
897 nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
898 {
899 struct nvmet_port *port = cm_id->context;
900 struct nvmet_rdma_device *ndev;
901 int inline_page_count;
902 int inline_sge_count;
903 int ret;
904
905 mutex_lock(&device_list_mutex);
906 list_for_each_entry(ndev, &device_list, entry) {
907 if (ndev->device->node_guid == cm_id->device->node_guid &&
908 kref_get_unless_zero(&ndev->ref))
909 goto out_unlock;
910 }
911
912 ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
913 if (!ndev)
914 goto out_err;
915
916 inline_page_count = num_pages(port->inline_data_size);
917 inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
918 cm_id->device->attrs.max_recv_sge) - 1;
919 if (inline_page_count > inline_sge_count) {
920 pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
921 port->inline_data_size, cm_id->device->name,
922 inline_sge_count * PAGE_SIZE);
923 port->inline_data_size = inline_sge_count * PAGE_SIZE;
924 inline_page_count = inline_sge_count;
925 }
926 ndev->inline_data_size = port->inline_data_size;
927 ndev->inline_page_count = inline_page_count;
928 ndev->device = cm_id->device;
929 kref_init(&ndev->ref);
930
931 ndev->pd = ib_alloc_pd(ndev->device, 0);
932 if (IS_ERR(ndev->pd))
933 goto out_free_dev;
934
935 if (nvmet_rdma_use_srq) {
936 ret = nvmet_rdma_init_srq(ndev);
937 if (ret)
938 goto out_free_pd;
939 }
940
941 list_add(&ndev->entry, &device_list);
942 out_unlock:
943 mutex_unlock(&device_list_mutex);
944 pr_debug("added %s.\n", ndev->device->name);
945 return ndev;
946
947 out_free_pd:
948 ib_dealloc_pd(ndev->pd);
949 out_free_dev:
950 kfree(ndev);
951 out_err:
952 mutex_unlock(&device_list_mutex);
953 return NULL;
954 }
955
956 static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
957 {
958 struct ib_qp_init_attr qp_attr;
959 struct nvmet_rdma_device *ndev = queue->dev;
960 int comp_vector, nr_cqe, ret, i;
961
962 /*
963 * Spread the io queues across completion vectors,
964 * but still keep all admin queues on vector 0.
965 */
966 comp_vector = !queue->host_qid ? 0 :
967 queue->idx % ndev->device->num_comp_vectors;
968
969 /*
970 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
971 */
972 nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
973
974 queue->cq = ib_alloc_cq(ndev->device, queue,
975 nr_cqe + 1, comp_vector,
976 IB_POLL_WORKQUEUE);
977 if (IS_ERR(queue->cq)) {
978 ret = PTR_ERR(queue->cq);
979 pr_err("failed to create CQ cqe= %d ret= %d\n",
980 nr_cqe + 1, ret);
981 goto out;
982 }
983
984 memset(&qp_attr, 0, sizeof(qp_attr));
985 qp_attr.qp_context = queue;
986 qp_attr.event_handler = nvmet_rdma_qp_event;
987 qp_attr.send_cq = queue->cq;
988 qp_attr.recv_cq = queue->cq;
989 qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
990 qp_attr.qp_type = IB_QPT_RC;
991 /* +1 for drain */
992 qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
993 qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
994 qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
995 ndev->device->attrs.max_send_sge);
996
997 if (ndev->srq) {
998 qp_attr.srq = ndev->srq;
999 } else {
1000 /* +1 for drain */
1001 qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
1002 qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
1003 }
1004
1005 ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
1006 if (ret) {
1007 pr_err("failed to create_qp ret= %d\n", ret);
1008 goto err_destroy_cq;
1009 }
1010
1011 atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
1012
1013 pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
1014 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
1015 qp_attr.cap.max_send_wr, queue->cm_id);
1016
1017 if (!ndev->srq) {
1018 for (i = 0; i < queue->recv_queue_size; i++) {
1019 queue->cmds[i].queue = queue;
1020 ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
1021 if (ret)
1022 goto err_destroy_qp;
1023 }
1024 }
1025
1026 out:
1027 return ret;
1028
1029 err_destroy_qp:
1030 rdma_destroy_qp(queue->cm_id);
1031 err_destroy_cq:
1032 ib_free_cq(queue->cq);
1033 goto out;
1034 }
1035
1036 static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
1037 {
1038 struct ib_qp *qp = queue->cm_id->qp;
1039
1040 ib_drain_qp(qp);
1041 rdma_destroy_id(queue->cm_id);
1042 ib_destroy_qp(qp);
1043 ib_free_cq(queue->cq);
1044 }
1045
1046 static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
1047 {
1048 pr_debug("freeing queue %d\n", queue->idx);
1049
1050 nvmet_sq_destroy(&queue->nvme_sq);
1051
1052 nvmet_rdma_destroy_queue_ib(queue);
1053 if (!queue->dev->srq) {
1054 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1055 queue->recv_queue_size,
1056 !queue->host_qid);
1057 }
1058 nvmet_rdma_free_rsps(queue);
1059 ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1060 kfree(queue);
1061 }
1062
1063 static void nvmet_rdma_release_queue_work(struct work_struct *w)
1064 {
1065 struct nvmet_rdma_queue *queue =
1066 container_of(w, struct nvmet_rdma_queue, release_work);
1067 struct nvmet_rdma_device *dev = queue->dev;
1068
1069 nvmet_rdma_free_queue(queue);
1070
1071 kref_put(&dev->ref, nvmet_rdma_free_dev);
1072 }
1073
1074 static int
1075 nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
1076 struct nvmet_rdma_queue *queue)
1077 {
1078 struct nvme_rdma_cm_req *req;
1079
1080 req = (struct nvme_rdma_cm_req *)conn->private_data;
1081 if (!req || conn->private_data_len == 0)
1082 return NVME_RDMA_CM_INVALID_LEN;
1083
1084 if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
1085 return NVME_RDMA_CM_INVALID_RECFMT;
1086
1087 queue->host_qid = le16_to_cpu(req->qid);
1088
1089 /*
1090 * req->hsqsize is a 0's based value, hence the +1 when sizing our recv
1091 * queue; req->hrqsize corresponds to our send queue size.
1092 */
1093 queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
1094 queue->send_queue_size = le16_to_cpu(req->hrqsize);
1095
1096 if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
1097 return NVME_RDMA_CM_INVALID_HSQSIZE;
1098
1099 /* XXX: Should we enforce some kind of max for IO queues? */
1100
1101 return 0;
1102 }
1103
1104 static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
1105 enum nvme_rdma_cm_status status)
1106 {
1107 struct nvme_rdma_cm_rej rej;
1108
1109 pr_debug("rejecting connect request: status %d (%s)\n",
1110 status, nvme_rdma_cm_msg(status));
1111
1112 rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1113 rej.sts = cpu_to_le16(status);
1114
1115 return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
1116 }
1117
1118 static struct nvmet_rdma_queue *
1119 nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
1120 struct rdma_cm_id *cm_id,
1121 struct rdma_cm_event *event)
1122 {
1123 struct nvmet_rdma_queue *queue;
1124 int ret;
1125
1126 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1127 if (!queue) {
1128 ret = NVME_RDMA_CM_NO_RSC;
1129 goto out_reject;
1130 }
1131
1132 ret = nvmet_sq_init(&queue->nvme_sq);
1133 if (ret) {
1134 ret = NVME_RDMA_CM_NO_RSC;
1135 goto out_free_queue;
1136 }
1137
1138 ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
1139 if (ret)
1140 goto out_destroy_sq;
1141
1142 /*
1143 * Schedules the actual release because calling rdma_destroy_id from
1144 * inside a CM callback would trigger a deadlock. (great API design..)
1145 */
1146 INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
1147 queue->dev = ndev;
1148 queue->cm_id = cm_id;
1149
1150 spin_lock_init(&queue->state_lock);
1151 queue->state = NVMET_RDMA_Q_CONNECTING;
1152 INIT_LIST_HEAD(&queue->rsp_wait_list);
1153 INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
1154 spin_lock_init(&queue->rsp_wr_wait_lock);
1155 INIT_LIST_HEAD(&queue->free_rsps);
1156 spin_lock_init(&queue->rsps_lock);
1157 INIT_LIST_HEAD(&queue->queue_list);
1158
1159 queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
1160 if (queue->idx < 0) {
1161 ret = NVME_RDMA_CM_NO_RSC;
1162 goto out_destroy_sq;
1163 }
1164
1165 ret = nvmet_rdma_alloc_rsps(queue);
1166 if (ret) {
1167 ret = NVME_RDMA_CM_NO_RSC;
1168 goto out_ida_remove;
1169 }
1170
1171 if (!ndev->srq) {
1172 queue->cmds = nvmet_rdma_alloc_cmds(ndev,
1173 queue->recv_queue_size,
1174 !queue->host_qid);
1175 if (IS_ERR(queue->cmds)) {
1176 ret = NVME_RDMA_CM_NO_RSC;
1177 goto out_free_responses;
1178 }
1179 }
1180
1181 ret = nvmet_rdma_create_queue_ib(queue);
1182 if (ret) {
1183 pr_err("%s: creating RDMA queue failed (%d).\n",
1184 __func__, ret);
1185 ret = NVME_RDMA_CM_NO_RSC;
1186 goto out_free_cmds;
1187 }
1188
1189 return queue;
1190
1191 out_free_cmds:
1192 if (!ndev->srq) {
1193 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1194 queue->recv_queue_size,
1195 !queue->host_qid);
1196 }
1197 out_free_responses:
1198 nvmet_rdma_free_rsps(queue);
1199 out_ida_remove:
1200 ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1201 out_destroy_sq:
1202 nvmet_sq_destroy(&queue->nvme_sq);
1203 out_free_queue:
1204 kfree(queue);
1205 out_reject:
1206 nvmet_rdma_cm_reject(cm_id, ret);
1207 return NULL;
1208 }
1209
1210 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
1211 {
1212 struct nvmet_rdma_queue *queue = priv;
1213
1214 switch (event->event) {
1215 case IB_EVENT_COMM_EST:
1216 rdma_notify(queue->cm_id, event->event);
1217 break;
1218 default:
1219 pr_err("received IB QP event: %s (%d)\n",
1220 ib_event_msg(event->event), event->event);
1221 break;
1222 }
1223 }
1224
1225 static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
1226 struct nvmet_rdma_queue *queue,
1227 struct rdma_conn_param *p)
1228 {
1229 struct rdma_conn_param param = { };
1230 struct nvme_rdma_cm_rep priv = { };
1231 int ret = -ENOMEM;
1232
1233 param.rnr_retry_count = 7;
1234 param.flow_control = 1;
1235 param.initiator_depth = min_t(u8, p->initiator_depth,
1236 queue->dev->device->attrs.max_qp_init_rd_atom);
1237 param.private_data = &priv;
1238 param.private_data_len = sizeof(priv);
1239 priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1240 priv.crqsize = cpu_to_le16(queue->recv_queue_size);
1241
1242 ret = rdma_accept(cm_id, &param);
1243 if (ret)
1244 pr_err("rdma_accept failed (error code = %d)\n", ret);
1245
1246 return ret;
1247 }
1248
1249 static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
1250 struct rdma_cm_event *event)
1251 {
1252 struct nvmet_rdma_device *ndev;
1253 struct nvmet_rdma_queue *queue;
1254 int ret = -EINVAL;
1255
1256 ndev = nvmet_rdma_find_get_device(cm_id);
1257 if (!ndev) {
1258 nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
1259 return -ECONNREFUSED;
1260 }
1261
1262 queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
1263 if (!queue) {
1264 ret = -ENOMEM;
1265 goto put_device;
1266 }
1267 queue->port = cm_id->context;
1268
1269 if (queue->host_qid == 0) {
1270 /* Let inflight controller teardown complete */
1271 flush_workqueue(nvmet_rdma_delete_wq);
1272 }
1273
1274 ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
1275 if (ret) {
1276 queue_work(nvmet_rdma_delete_wq, &queue->release_work);
1277 /* Destroying rdma_cm id is not needed here */
1278 return 0;
1279 }
1280
1281 mutex_lock(&nvmet_rdma_queue_mutex);
1282 list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
1283 mutex_unlock(&nvmet_rdma_queue_mutex);
1284
1285 return 0;
1286
1287 put_device:
1288 kref_put(&ndev->ref, nvmet_rdma_free_dev);
1289
1290 return ret;
1291 }
1292
1293 static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
1294 {
1295 unsigned long flags;
1296
1297 spin_lock_irqsave(&queue->state_lock, flags);
1298 if (queue->state != NVMET_RDMA_Q_CONNECTING) {
1299 pr_warn("trying to establish a connected queue\n");
1300 goto out_unlock;
1301 }
1302 queue->state = NVMET_RDMA_Q_LIVE;
1303
1304 while (!list_empty(&queue->rsp_wait_list)) {
1305 struct nvmet_rdma_rsp *cmd;
1306
1307 cmd = list_first_entry(&queue->rsp_wait_list,
1308 struct nvmet_rdma_rsp, wait_list);
1309 list_del(&cmd->wait_list);
1310
1311 spin_unlock_irqrestore(&queue->state_lock, flags);
1312 nvmet_rdma_handle_command(queue, cmd);
1313 spin_lock_irqsave(&queue->state_lock, flags);
1314 }
1315
1316 out_unlock:
1317 spin_unlock_irqrestore(&queue->state_lock, flags);
1318 }
1319
1320 static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1321 {
1322 bool disconnect = false;
1323 unsigned long flags;
1324
1325 pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);
1326
1327 spin_lock_irqsave(&queue->state_lock, flags);
1328 switch (queue->state) {
1329 case NVMET_RDMA_Q_CONNECTING:
1330 case NVMET_RDMA_Q_LIVE:
1331 queue->state = NVMET_RDMA_Q_DISCONNECTING;
1332 disconnect = true;
1333 break;
1334 case NVMET_RDMA_Q_DISCONNECTING:
1335 break;
1336 }
1337 spin_unlock_irqrestore(&queue->state_lock, flags);
1338
1339 if (disconnect) {
1340 rdma_disconnect(queue->cm_id);
1341 queue_work(nvmet_rdma_delete_wq, &queue->release_work);
1342 }
1343 }
1344
1345 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1346 {
1347 bool disconnect = false;
1348
1349 mutex_lock(&nvmet_rdma_queue_mutex);
1350 if (!list_empty(&queue->queue_list)) {
1351 list_del_init(&queue->queue_list);
1352 disconnect = true;
1353 }
1354 mutex_unlock(&nvmet_rdma_queue_mutex);
1355
1356 if (disconnect)
1357 __nvmet_rdma_queue_disconnect(queue);
1358 }
1359
1360 static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
1361 struct nvmet_rdma_queue *queue)
1362 {
1363 WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);
1364
1365 mutex_lock(&nvmet_rdma_queue_mutex);
1366 if (!list_empty(&queue->queue_list))
1367 list_del_init(&queue->queue_list);
1368 mutex_unlock(&nvmet_rdma_queue_mutex);
1369
1370 pr_err("failed to connect queue %d\n", queue->idx);
1371 queue_work(nvmet_rdma_delete_wq, &queue->release_work);
1372 }
1373
1374 /**
1375 * nvmet_rdma_device_removal() - Handle RDMA device removal
1376 * @cm_id: rdma_cm id, used for nvmet port
1377 * @queue: nvmet rdma queue (cm id qp_context)
1378 *
1379 * DEVICE_REMOVAL event notifies us that the RDMA device is about
1380 * to be unplugged. Note that this event can be generated on a normal
1381 * queue cm_id and/or a device bound listener cm_id (in which case
1382 * queue will be null).
1383 *
1384 * We registered an ib_client to handle device removal for queues,
1385 * so we only need to handle the listening port cm_ids. In this case
1386 * we nullify the priv to prevent double cm_id destruction and destroy
1387 * the cm_id implicitly by returning a non-zero rc to the callout.
1388 */
1389 static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
1390 struct nvmet_rdma_queue *queue)
1391 {
1392 struct nvmet_port *port;
1393
1394 if (queue) {
1395 /*
1396 * This is a queue cm_id. We have registered
1397 * an ib_client to handle queue removal,
1398 * so don't interfere and just return.
1399 */
1400 return 0;
1401 }
1402
1403 port = cm_id->context;
1404
1405 /*
1406 * This is a listener cm_id. Make sure that a
1407 * future remove_port won't invoke a double
1408 * cm_id destroy. Use an atomic xchg to make sure
1409 * we don't compete with remove_port.
1410 */
1411 if (xchg(&port->priv, NULL) != cm_id)
1412 return 0;
1413
1414 /*
1415 * We need to return 1 so that the core will destroy
1416 * its own ID. What a great API design..
1417 */
1418 return 1;
1419 }
1420
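/*
 * Central rdma_cm event handler, shared by the listener cm_id (whose context
 * is the nvmet_port) and the per-queue cm_ids (whose qp_context is the queue).
 */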
1421 static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
1422 struct rdma_cm_event *event)
1423 {
1424 struct nvmet_rdma_queue *queue = NULL;
1425 int ret = 0;
1426
1427 if (cm_id->qp)
1428 queue = cm_id->qp->qp_context;
1429
1430 pr_debug("%s (%d): status %d id %p\n",
1431 rdma_event_msg(event->event), event->event,
1432 event->status, cm_id);
1433
1434 switch (event->event) {
1435 case RDMA_CM_EVENT_CONNECT_REQUEST:
1436 ret = nvmet_rdma_queue_connect(cm_id, event);
1437 break;
1438 case RDMA_CM_EVENT_ESTABLISHED:
1439 nvmet_rdma_queue_established(queue);
1440 break;
1441 case RDMA_CM_EVENT_ADDR_CHANGE:
1442 case RDMA_CM_EVENT_DISCONNECTED:
1443 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1444 nvmet_rdma_queue_disconnect(queue);
1445 break;
1446 case RDMA_CM_EVENT_DEVICE_REMOVAL:
1447 ret = nvmet_rdma_device_removal(cm_id, queue);
1448 break;
1449 case RDMA_CM_EVENT_REJECTED:
1450 pr_debug("Connection rejected: %s\n",
1451 rdma_reject_msg(cm_id, event->status));
1452 /* FALLTHROUGH */
1453 case RDMA_CM_EVENT_UNREACHABLE:
1454 case RDMA_CM_EVENT_CONNECT_ERROR:
1455 nvmet_rdma_queue_connect_fail(cm_id, queue);
1456 break;
1457 default:
1458 pr_err("received unrecognized RDMA CM event %d\n",
1459 event->event);
1460 break;
1461 }
1462
1463 return ret;
1464 }
1465
1466 static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
1467 {
1468 struct nvmet_rdma_queue *queue;
1469
1470 restart:
1471 mutex_lock(&nvmet_rdma_queue_mutex);
1472 list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
1473 if (queue->nvme_sq.ctrl == ctrl) {
1474 list_del_init(&queue->queue_list);
1475 mutex_unlock(&nvmet_rdma_queue_mutex);
1476
1477 __nvmet_rdma_queue_disconnect(queue);
1478 goto restart;
1479 }
1480 }
1481 mutex_unlock(&nvmet_rdma_queue_mutex);
1482 }
1483
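/*
 * Fabrics ->add_port: resolve traddr/trsvcid into a sockaddr and create an
 * rdma_cm listener for it; the cm_id is stashed in port->priv so that
 * remove_port (or device removal) can tear it down.
 */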
1484 static int nvmet_rdma_add_port(struct nvmet_port *port)
1485 {
1486 struct rdma_cm_id *cm_id;
1487 struct sockaddr_storage addr = { };
1488 __kernel_sa_family_t af;
1489 int ret;
1490
1491 switch (port->disc_addr.adrfam) {
1492 case NVMF_ADDR_FAMILY_IP4:
1493 af = AF_INET;
1494 break;
1495 case NVMF_ADDR_FAMILY_IP6:
1496 af = AF_INET6;
1497 break;
1498 default:
1499 pr_err("address family %d not supported\n",
1500 port->disc_addr.adrfam);
1501 return -EINVAL;
1502 }
1503
1504 if (port->inline_data_size < 0) {
1505 port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
1506 } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
1507 pr_warn("inline_data_size %u is too large, reducing to %u\n",
1508 port->inline_data_size,
1509 NVMET_RDMA_MAX_INLINE_DATA_SIZE);
1510 port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
1511 }
1512
1513 ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
1514 port->disc_addr.trsvcid, &addr);
1515 if (ret) {
1516 pr_err("malformed ip/port passed: %s:%s\n",
1517 port->disc_addr.traddr, port->disc_addr.trsvcid);
1518 return ret;
1519 }
1520
1521 cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
1522 RDMA_PS_TCP, IB_QPT_RC);
1523 if (IS_ERR(cm_id)) {
1524 pr_err("CM ID creation failed\n");
1525 return PTR_ERR(cm_id);
1526 }
1527
1528 /*
1529 * Allow both IPv4 and IPv6 sockets to bind a single port
1530 * at the same time.
1531 */
1532 ret = rdma_set_afonly(cm_id, 1);
1533 if (ret) {
1534 pr_err("rdma_set_afonly failed (%d)\n", ret);
1535 goto out_destroy_id;
1536 }
1537
1538 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
1539 if (ret) {
1540 pr_err("binding CM ID to %pISpcs failed (%d)\n",
1541 (struct sockaddr *)&addr, ret);
1542 goto out_destroy_id;
1543 }
1544
1545 ret = rdma_listen(cm_id, 128);
1546 if (ret) {
1547 pr_err("listening to %pISpcs failed (%d)\n",
1548 (struct sockaddr *)&addr, ret);
1549 goto out_destroy_id;
1550 }
1551
1552 pr_info("enabling port %d (%pISpcs)\n",
1553 le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
1554 port->priv = cm_id;
1555 return 0;
1556
1557 out_destroy_id:
1558 rdma_destroy_id(cm_id);
1559 return ret;
1560 }
1561
1562 static void nvmet_rdma_remove_port(struct nvmet_port *port)
1563 {
1564 struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);
1565
1566 if (cm_id)
1567 rdma_destroy_id(cm_id);
1568 }
1569
1570 static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
1571 struct nvmet_port *port, char *traddr)
1572 {
1573 struct rdma_cm_id *cm_id = port->priv;
1574
1575 if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
1576 struct nvmet_rdma_rsp *rsp =
1577 container_of(req, struct nvmet_rdma_rsp, req);
1578 struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
1579 struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;
1580
1581 sprintf(traddr, "%pISc", addr);
1582 } else {
1583 memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
1584 }
1585 }
1586
1587 static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
1588 .owner = THIS_MODULE,
1589 .type = NVMF_TRTYPE_RDMA,
1590 .msdbd = 1,
1591 .has_keyed_sgls = 1,
1592 .add_port = nvmet_rdma_add_port,
1593 .remove_port = nvmet_rdma_remove_port,
1594 .queue_response = nvmet_rdma_queue_response,
1595 .delete_ctrl = nvmet_rdma_delete_ctrl,
1596 .disc_traddr = nvmet_rdma_disc_port_addr,
1597 };
1598
1599 static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
1600 {
1601 struct nvmet_rdma_queue *queue, *tmp;
1602 struct nvmet_rdma_device *ndev;
1603 bool found = false;
1604
1605 mutex_lock(&device_list_mutex);
1606 list_for_each_entry(ndev, &device_list, entry) {
1607 if (ndev->device == ib_device) {
1608 found = true;
1609 break;
1610 }
1611 }
1612 mutex_unlock(&device_list_mutex);
1613
1614 if (!found)
1615 return;
1616
1617 /*
1618 * IB Device that is used by nvmet controllers is being removed,
1619 * delete all queues using this device.
1620 */
1621 mutex_lock(&nvmet_rdma_queue_mutex);
1622 list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
1623 queue_list) {
1624 if (queue->dev->device != ib_device)
1625 continue;
1626
1627 pr_info("Removing queue %d\n", queue->idx);
1628 list_del_init(&queue->queue_list);
1629 __nvmet_rdma_queue_disconnect(queue);
1630 }
1631 mutex_unlock(&nvmet_rdma_queue_mutex);
1632
1633 flush_scheduled_work();
1634 }
1635
1636 static struct ib_client nvmet_rdma_ib_client = {
1637 .name = "nvmet_rdma",
1638 .remove = nvmet_rdma_remove_one
1639 };
1640
1641 static int __init nvmet_rdma_init(void)
1642 {
1643 int ret;
1644
1645 ret = ib_register_client(&nvmet_rdma_ib_client);
1646 if (ret)
1647 return ret;
1648
1649 ret = nvmet_register_transport(&nvmet_rdma_ops);
1650 if (ret)
1651 goto err_ib_client;
1652
1653 nvmet_rdma_delete_wq = alloc_workqueue("nvmet-rdma-delete-wq",
1654 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
1655 if (!nvmet_rdma_delete_wq) {
1656 ret = -ENOMEM;
1657 goto err_unreg_transport;
1658 }
1659
1660 return 0;
1661
1662 err_unreg_transport:
1663 nvmet_unregister_transport(&nvmet_rdma_ops);
1664 err_ib_client:
1665 ib_unregister_client(&nvmet_rdma_ib_client);
1666 return ret;
1667 }
1668
1669 static void __exit nvmet_rdma_exit(void)
1670 {
1671 destroy_workqueue(nvmet_rdma_delete_wq);
1672 nvmet_unregister_transport(&nvmet_rdma_ops);
1673 ib_unregister_client(&nvmet_rdma_ib_client);
1674 WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
1675 ida_destroy(&nvmet_rdma_queue_ida);
1676 }
1677
1678 module_init(nvmet_rdma_init);
1679 module_exit(nvmet_rdma_exit);
1680
1681 MODULE_LICENSE("GPL v2");
1682 MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */