drivers/nvme/target/rdma.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * NVMe over Fabrics RDMA target.
   4  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
   5  */
   6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7 #include <linux/atomic.h>
   8 #include <linux/ctype.h>
   9 #include <linux/delay.h>
  10 #include <linux/err.h>
  11 #include <linux/init.h>
  12 #include <linux/module.h>
  13 #include <linux/nvme.h>
  14 #include <linux/slab.h>
  15 #include <linux/string.h>
  16 #include <linux/wait.h>
  17 #include <linux/inet.h>
  18 #include <asm/unaligned.h>
  19
  20 #include <rdma/ib_verbs.h>
  21 #include <rdma/rdma_cm.h>
  22 #include <rdma/rw.h>
  23
  24 #include <linux/nvme-rdma.h>
  25 #include "nvmet.h"
  26
/*
 * Inline data limits: the default inline data size is one page, a command
 * carries at most NVMET_RDMA_MAX_INLINE_SGE inline SGEs, and the maximum
 * inline data size is the larger of 16KB and one page.
 */
#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE     PAGE_SIZE
#define NVMET_RDMA_MAX_INLINE_SGE               4
#define NVMET_RDMA_MAX_INLINE_DATA_SIZE         max_t(int, SZ_16K, PAGE_SIZE)
  33
/*
 * Receive-side state for one command capsule.
 *
 * sge[0] describes the DMA mapping of @nvme_cmd; sge[1..] describe the
 * inline data pages, whose struct pages are tracked in @inline_sg.
 * @cqe completes through nvmet_rdma_recv_done() and @wr is the receive work
 * request that gets posted.  @queue is filled in by the receive completion
 * handler.
 */
struct nvmet_rdma_cmd {
        struct ib_sge           sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
        struct ib_cqe           cqe;
        struct ib_recv_wr       wr;
        struct scatterlist      inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
        struct nvme_command     *nvme_cmd;
        struct nvmet_rdma_queue *queue;
};
  42
/*
 * Bits stored in nvmet_rdma_rsp.flags:
 *  - INLINE_DATA: the request data lives in the command's inline pages.
 *  - INVALIDATE_RKEY: the response is sent with SEND_WITH_INV using
 *    nvmet_rdma_rsp.invalidate_rkey.
 */
enum {
        NVMET_RDMA_REQ_INLINE_DATA      = (1 << 0),
        NVMET_RDMA_REQ_INVALIDATE_RKEY  = (1 << 1),
};
  47
/*
 * Per-request response state.
 *
 * @send_sge/@send_cqe/@send_wr: the RDMA SEND that carries the NVMe
 *      completion (req.rsp); it completes through nvmet_rdma_send_done().
 * @cmd, @queue: the received command and owning queue, set when the command
 *      is received.
 * @read_cqe, @rw: completion entry and rdma_rw context for the data
 *      transfer; @read_cqe completes through nvmet_rdma_read_data_done().
 * @req: the generic nvmet request.
 * @allocated: true when this rsp was allocated on demand by
 *      nvmet_rdma_get_rsp() instead of coming from queue->rsps; such a rsp
 *      is freed, not recycled, by nvmet_rdma_put_rsp().
 * @n_rdma: accumulated return value of rdma_rw_ctx_init(); accounted against
 *      queue->sq_wr_avail together with the SEND.
 * @flags: NVMET_RDMA_REQ_* bits.
 * @invalidate_rkey: remote key used with NVMET_RDMA_REQ_INVALIDATE_RKEY.
 * @wait_list: link on queue->rsp_wait_list or queue->rsp_wr_wait_list.
 * @free_list: link on queue->free_rsps.
 */
struct nvmet_rdma_rsp {
        struct ib_sge           send_sge;
        struct ib_cqe           send_cqe;
        struct ib_send_wr       send_wr;

        struct nvmet_rdma_cmd   *cmd;
        struct nvmet_rdma_queue *queue;

        struct ib_cqe           read_cqe;
        struct rdma_rw_ctx      rw;

        struct nvmet_req        req;

        bool                    allocated;
        u8                      n_rdma;
        u32                     flags;
        u32                     invalidate_rkey;

        struct list_head        wait_list;
        struct list_head        free_list;
};
  69
/*
 * Queue life-cycle states.  Commands received while CONNECTING are parked
 * on rsp_wait_list; commands received in any other non-LIVE state have
 * their response returned without being handled.
 */
enum nvmet_rdma_queue_state {
        NVMET_RDMA_Q_CONNECTING,
        NVMET_RDMA_Q_LIVE,
        NVMET_RDMA_Q_DISCONNECTING,
};
  75
/*
 * One RDMA queue pair as seen by the target.
 *
 * @sq_wr_avail: send-queue work-request credits; decremented before posting
 *      and returned when a response is released.
 * @state, @state_lock: queue state; the lock is taken when a received
 *      command finds the queue not LIVE.
 * @rsps: array of 2 * @recv_queue_size preallocated responses.
 * @free_rsps, @rsps_lock: free list of responses and the lock guarding it.
 * @rsp_wait_list: responses received while the queue was CONNECTING.
 * @rsp_wr_wait_list, @rsp_wr_wait_lock: responses waiting for send-queue
 *      credits and the lock guarding that list.
 *
 * NOTE(review): the remaining fields are not exercised by the code visible
 * alongside this definition; their roles should be confirmed against the
 * queue setup/teardown code.
 */
struct nvmet_rdma_queue {
        struct rdma_cm_id       *cm_id;
        struct nvmet_port       *port;
        struct ib_cq            *cq;
        atomic_t                sq_wr_avail;
        struct nvmet_rdma_device *dev;
        spinlock_t              state_lock;
        enum nvmet_rdma_queue_state state;
        struct nvmet_cq         nvme_cq;
        struct nvmet_sq         nvme_sq;

        struct nvmet_rdma_rsp   *rsps;
        struct list_head        free_rsps;
        spinlock_t              rsps_lock;
        struct nvmet_rdma_cmd   *cmds;

        struct work_struct      release_work;
        struct list_head        rsp_wait_list;
        struct list_head        rsp_wr_wait_list;
        spinlock_t              rsp_wr_wait_lock;

        int                     idx;
        int                     host_qid;
        int                     recv_queue_size;
        int                     send_queue_size;

        struct list_head        queue_list;
};
 104
/*
 * Per-IB-device state shared by the queues created on that device.
 *
 * @pd: protection domain; its local_dma_lkey is used for every SGE.
 * @srq, @srq_cmds, @srq_size: optional shared receive queue, the commands
 *      posted to it, and their count; @srq is NULL when no SRQ is in use.
 * @ref: reference count; nvmet_rdma_free_dev() is its release function.
 * @entry: link on device_list.
 * @inline_data_size: inline data bytes per command (0 disables inline data).
 * @inline_page_count: number of inline pages/SGEs per command.
 */
struct nvmet_rdma_device {
        struct ib_device        *device;
        struct ib_pd            *pd;
        struct ib_srq           *srq;
        struct nvmet_rdma_cmd   *srq_cmds;
        size_t                  srq_size;
        struct kref             ref;
        struct list_head        entry;
        int                     inline_data_size;
        int                     inline_page_count;
};
 116
/* Module parameter "use_srq"; mode 0444, so it is read-only via sysfs. */
static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

/*
 * Queue bookkeeping.
 * NOTE(review): presumably the IDA hands out queue->idx and the mutex guards
 * the queue list; the users are not visible here - confirm.
 */
static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);

/* All nvmet_rdma_device instances; modified under device_list_mutex. */
static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

/* Forward declarations for functions used before their definition. */
static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
                                struct nvmet_rdma_rsp *r);
static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
                                struct nvmet_rdma_rsp *r);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;
 140
 141 static int num_pages(int len)
 142 {
 143         return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
 144 }
 145
 146 /* XXX: really should move to a generic header sooner or later.. */
 147 static inline u32 get_unaligned_le24(const u8 *p)
 148 {
 149         return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
 150 }
 151
 152 static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
 153 {
 154         return nvme_is_write(rsp->req.cmd) &&
 155                 rsp->req.transfer_len &&
 156                 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
 157 }
 158
 159 static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
 160 {
 161         return !nvme_is_write(rsp->req.cmd) &&
 162                 rsp->req.transfer_len &&
 163                 !rsp->req.rsp->status &&
 164                 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
 165 }
 166
 167 static inline struct nvmet_rdma_rsp *
 168 nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
 169 {
 170         struct nvmet_rdma_rsp *rsp;
 171         unsigned long flags;
 172
 173         spin_lock_irqsave(&queue->rsps_lock, flags);
 174         rsp = list_first_entry_or_null(&queue->free_rsps,
 175                                 struct nvmet_rdma_rsp, free_list);
 176         if (likely(rsp))
 177                 list_del(&rsp->free_list);
 178         spin_unlock_irqrestore(&queue->rsps_lock, flags);
 179
 180         if (unlikely(!rsp)) {
 181                 int ret;
 182
 183                 rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
 184                 if (unlikely(!rsp))
 185                         return NULL;
 186                 ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
 187                 if (unlikely(ret)) {
 188                         kfree(rsp);
 189                         return NULL;
 190                 }
 191
 192                 rsp->allocated = true;
 193         }
 194
 195         return rsp;
 196 }
 197
 198 static inline void
 199 nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
 200 {
 201         unsigned long flags;
 202
 203         if (unlikely(rsp->allocated)) {
 204                 nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
 205                 kfree(rsp);
 206                 return;
 207         }
 208
 209         spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
 210         list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
 211         spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
 212 }
 213
 214 static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
 215                                 struct nvmet_rdma_cmd *c)
 216 {
 217         struct scatterlist *sg;
 218         struct ib_sge *sge;
 219         int i;
 220
 221         if (!ndev->inline_data_size)
 222                 return;
 223
 224         sg = c->inline_sg;
 225         sge = &c->sge[1];
 226
 227         for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
 228                 if (sge->length)
 229                         ib_dma_unmap_page(ndev->device, sge->addr,
 230                                         sge->length, DMA_FROM_DEVICE);
 231                 if (sg_page(sg))
 232                         __free_page(sg_page(sg));
 233         }
 234 }
 235
 236 static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
 237                                 struct nvmet_rdma_cmd *c)
 238 {
 239         struct scatterlist *sg;
 240         struct ib_sge *sge;
 241         struct page *pg;
 242         int len;
 243         int i;
 244
 245         if (!ndev->inline_data_size)
 246                 return 0;
 247
 248         sg = c->inline_sg;
 249         sg_init_table(sg, ndev->inline_page_count);
 250         sge = &c->sge[1];
 251         len = ndev->inline_data_size;
 252
 253         for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
 254                 pg = alloc_page(GFP_KERNEL);
 255                 if (!pg)
 256                         goto out_err;
 257                 sg_assign_page(sg, pg);
 258                 sge->addr = ib_dma_map_page(ndev->device,
 259                         pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
 260                 if (ib_dma_mapping_error(ndev->device, sge->addr))
 261                         goto out_err;
 262                 sge->length = min_t(int, len, PAGE_SIZE);
 263                 sge->lkey = ndev->pd->local_dma_lkey;
 264                 len -= sge->length;
 265         }
 266
 267         return 0;
 268 out_err:
 269         for (; i >= 0; i--, sg--, sge--) {
 270                 if (sge->length)
 271                         ib_dma_unmap_page(ndev->device, sge->addr,
 272                                         sge->length, DMA_FROM_DEVICE);
 273                 if (sg_page(sg))
 274                         __free_page(sg_page(sg));
 275         }
 276         return -ENOMEM;
 277 }
 278
 279 static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
 280                         struct nvmet_rdma_cmd *c, bool admin)
 281 {
 282         /* NVMe command / RDMA RECV */
 283         c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
 284         if (!c->nvme_cmd)
 285                 goto out;
 286
 287         c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
 288                         sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
 289         if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
 290                 goto out_free_cmd;
 291
 292         c->sge[0].length = sizeof(*c->nvme_cmd);
 293         c->sge[0].lkey = ndev->pd->local_dma_lkey;
 294
 295         if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
 296                 goto out_unmap_cmd;
 297
 298         c->cqe.done = nvmet_rdma_recv_done;
 299
 300         c->wr.wr_cqe = &c->cqe;
 301         c->wr.sg_list = c->sge;
 302         c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;
 303
 304         return 0;
 305
 306 out_unmap_cmd:
 307         ib_dma_unmap_single(ndev->device, c->sge[0].addr,
 308                         sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
 309 out_free_cmd:
 310         kfree(c->nvme_cmd);
 311
 312 out:
 313         return -ENOMEM;
 314 }
 315
 316 static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
 317                 struct nvmet_rdma_cmd *c, bool admin)
 318 {
 319         if (!admin)
 320                 nvmet_rdma_free_inline_pages(ndev, c);
 321         ib_dma_unmap_single(ndev->device, c->sge[0].addr,
 322                                 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
 323         kfree(c->nvme_cmd);
 324 }
 325
 326 static struct nvmet_rdma_cmd *
 327 nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
 328                 int nr_cmds, bool admin)
 329 {
 330         struct nvmet_rdma_cmd *cmds;
 331         int ret = -EINVAL, i;
 332
 333         cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
 334         if (!cmds)
 335                 goto out;
 336
 337         for (i = 0; i < nr_cmds; i++) {
 338                 ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
 339                 if (ret)
 340                         goto out_free;
 341         }
 342
 343         return cmds;
 344
 345 out_free:
 346         while (--i >= 0)
 347                 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
 348         kfree(cmds);
 349 out:
 350         return ERR_PTR(ret);
 351 }
 352
 353 static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
 354                 struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
 355 {
 356         int i;
 357
 358         for (i = 0; i < nr_cmds; i++)
 359                 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
 360         kfree(cmds);
 361 }
 362
 363 static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
 364                 struct nvmet_rdma_rsp *r)
 365 {
 366         /* NVMe CQE / RDMA SEND */
 367         r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
 368         if (!r->req.rsp)
 369                 goto out;
 370
 371         r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
 372                         sizeof(*r->req.rsp), DMA_TO_DEVICE);
 373         if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
 374                 goto out_free_rsp;
 375
 376         r->send_sge.length = sizeof(*r->req.rsp);
 377         r->send_sge.lkey = ndev->pd->local_dma_lkey;
 378
 379         r->send_cqe.done = nvmet_rdma_send_done;
 380
 381         r->send_wr.wr_cqe = &r->send_cqe;
 382         r->send_wr.sg_list = &r->send_sge;
 383         r->send_wr.num_sge = 1;
 384         r->send_wr.send_flags = IB_SEND_SIGNALED;
 385
 386         /* Data In / RDMA READ */
 387         r->read_cqe.done = nvmet_rdma_read_data_done;
 388         return 0;
 389
 390 out_free_rsp:
 391         kfree(r->req.rsp);
 392 out:
 393         return -ENOMEM;
 394 }
 395
 396 static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
 397                 struct nvmet_rdma_rsp *r)
 398 {
 399         ib_dma_unmap_single(ndev->device, r->send_sge.addr,
 400                                 sizeof(*r->req.rsp), DMA_TO_DEVICE);
 401         kfree(r->req.rsp);
 402 }
 403
 404 static int
 405 nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
 406 {
 407         struct nvmet_rdma_device *ndev = queue->dev;
 408         int nr_rsps = queue->recv_queue_size * 2;
 409         int ret = -EINVAL, i;
 410
 411         queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
 412                         GFP_KERNEL);
 413         if (!queue->rsps)
 414                 goto out;
 415
 416         for (i = 0; i < nr_rsps; i++) {
 417                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
 418
 419                 ret = nvmet_rdma_alloc_rsp(ndev, rsp);
 420                 if (ret)
 421                         goto out_free;
 422
 423                 list_add_tail(&rsp->free_list, &queue->free_rsps);
 424         }
 425
 426         return 0;
 427
 428 out_free:
 429         while (--i >= 0) {
 430                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
 431
 432                 list_del(&rsp->free_list);
 433                 nvmet_rdma_free_rsp(ndev, rsp);
 434         }
 435         kfree(queue->rsps);
 436 out:
 437         return ret;
 438 }
 439
 440 static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
 441 {
 442         struct nvmet_rdma_device *ndev = queue->dev;
 443         int i, nr_rsps = queue->recv_queue_size * 2;
 444
 445         for (i = 0; i < nr_rsps; i++) {
 446                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
 447
 448                 list_del(&rsp->free_list);
 449                 nvmet_rdma_free_rsp(ndev, rsp);
 450         }
 451         kfree(queue->rsps);
 452 }
 453
 454 static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
 455                 struct nvmet_rdma_cmd *cmd)
 456 {
 457         int ret;
 458
 459         ib_dma_sync_single_for_device(ndev->device,
 460                 cmd->sge[0].addr, cmd->sge[0].length,
 461                 DMA_FROM_DEVICE);
 462
 463         if (ndev->srq)
 464                 ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
 465         else
 466                 ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);
 467
 468         if (unlikely(ret))
 469                 pr_err("post_recv cmd failed\n");
 470
 471         return ret;
 472 }
 473
 474 static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
 475 {
 476         spin_lock(&queue->rsp_wr_wait_lock);
 477         while (!list_empty(&queue->rsp_wr_wait_list)) {
 478                 struct nvmet_rdma_rsp *rsp;
 479                 bool ret;
 480
 481                 rsp = list_entry(queue->rsp_wr_wait_list.next,
 482                                 struct nvmet_rdma_rsp, wait_list);
 483                 list_del(&rsp->wait_list);
 484
 485                 spin_unlock(&queue->rsp_wr_wait_lock);
 486                 ret = nvmet_rdma_execute_command(rsp);
 487                 spin_lock(&queue->rsp_wr_wait_lock);
 488
 489                 if (!ret) {
 490                         list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
 491                         break;
 492                 }
 493         }
 494         spin_unlock(&queue->rsp_wr_wait_lock);
 495 }
 496
 497
/*
 * Release everything a finished response holds and hand the response back:
 * return its send-queue credits, destroy the rdma_rw context if one was
 * set up, free a non-inline SGL, retry commands waiting for credits, and
 * finally put the response.
 */
static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
        struct nvmet_rdma_queue *queue = rsp->queue;

        /* One credit for the SEND plus rsp->n_rdma for the rdma_rw WRs. */
        atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

        if (rsp->n_rdma) {
                rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
                                queue->cm_id->port_num, rsp->req.sg,
                                rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
        }

        /* The inline scatterlist belongs to the command; never free it. */
        if (rsp->req.sg != rsp->cmd->inline_sg)
                nvmet_req_free_sgl(&rsp->req);

        /* Credits were just returned, so parked commands may now fit. */
        if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
                nvmet_rdma_process_wr_wait_list(queue);

        nvmet_rdma_put_rsp(rsp);
}
 518
 519 static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
 520 {
 521         if (queue->nvme_sq.ctrl) {
 522                 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
 523         } else {
 524                 /*
 525                  * we didn't setup the controller yet in case
 526                  * of admin connect error, just disconnect and
 527                  * cleanup the queue
 528                  */
 529                 nvmet_rdma_queue_disconnect(queue);
 530         }
 531 }
 532
 533 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
 534 {
 535         struct nvmet_rdma_rsp *rsp =
 536                 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
 537         struct nvmet_rdma_queue *queue = cq->cq_context;
 538
 539         nvmet_rdma_release_rsp(rsp);
 540
 541         if (unlikely(wc->status != IB_WC_SUCCESS &&
 542                      wc->status != IB_WC_WR_FLUSH_ERR)) {
 543                 pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
 544                         wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
 545                 nvmet_rdma_error_comp(queue);
 546         }
 547 }
 548
 549 static void nvmet_rdma_queue_response(struct nvmet_req *req)
 550 {
 551         struct nvmet_rdma_rsp *rsp =
 552                 container_of(req, struct nvmet_rdma_rsp, req);
 553         struct rdma_cm_id *cm_id = rsp->queue->cm_id;
 554         struct ib_send_wr *first_wr;
 555
 556         if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
 557                 rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
 558                 rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
 559         } else {
 560                 rsp->send_wr.opcode = IB_WR_SEND;
 561         }
 562
 563         if (nvmet_rdma_need_data_out(rsp))
 564                 first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
 565                                 cm_id->port_num, NULL, &rsp->send_wr);
 566         else
 567                 first_wr = &rsp->send_wr;
 568
 569         nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);
 570
 571         ib_dma_sync_single_for_device(rsp->queue->dev->device,
 572                 rsp->send_sge.addr, rsp->send_sge.length,
 573                 DMA_TO_DEVICE);
 574
 575         if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
 576                 pr_err("sending cmd response failed\n");
 577                 nvmet_rdma_release_rsp(rsp);
 578         }
 579 }
 580
 581 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
 582 {
 583         struct nvmet_rdma_rsp *rsp =
 584                 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
 585         struct nvmet_rdma_queue *queue = cq->cq_context;
 586
 587         WARN_ON(rsp->n_rdma <= 0);
 588         atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
 589         rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
 590                         queue->cm_id->port_num, rsp->req.sg,
 591                         rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
 592         rsp->n_rdma = 0;
 593
 594         if (unlikely(wc->status != IB_WC_SUCCESS)) {
 595                 nvmet_req_uninit(&rsp->req);
 596                 nvmet_rdma_release_rsp(rsp);
 597                 if (wc->status != IB_WC_WR_FLUSH_ERR) {
 598                         pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
 599                                 wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
 600                         nvmet_rdma_error_comp(queue);
 601                 }
 602                 return;
 603         }
 604
 605         nvmet_req_execute(&rsp->req);
 606 }
 607
/*
 * Point the request at the command's inline scatterlist, describing @len
 * bytes that start @off bytes into the first inline page.
 *
 * NOTE(review): the entry count comes from @len alone and @off is applied
 * only to the first entry.  With a non-zero @off whose data crosses a page
 * boundary, or with @off >= PAGE_SIZE, the entries may not add up to @len.
 * Confirm that hosts only ever use offset 0, or that callers restrict @off.
 */
static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
                u64 off)
{
        int sg_count = num_pages(len);
        struct scatterlist *sg;
        int i;

        sg = rsp->cmd->inline_sg;
        for (i = 0; i < sg_count; i++, sg++) {
                /* Only the last used entry terminates the list. */
                if (i < sg_count - 1)
                        sg_unmark_end(sg);
                else
                        sg_mark_end(sg);
                sg->offset = off;
                sg->length = min_t(int, len, PAGE_SIZE - off);
                len -= sg->length;
                /* The offset applies to the first page only. */
                if (!i)
                        off = 0;
        }

        rsp->req.sg = rsp->cmd->inline_sg;
        rsp->req.sg_cnt = sg_count;
}
 631
 632 static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
 633 {
 634         struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
 635         u64 off = le64_to_cpu(sgl->addr);
 636         u32 len = le32_to_cpu(sgl->length);
 637
 638         if (!nvme_is_write(rsp->req.cmd)) {
 639                 rsp->req.error_loc =
 640                         offsetof(struct nvme_common_command, opcode);
 641                 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 642         }
 643
 644         if (off + len > rsp->queue->dev->inline_data_size) {
 645                 pr_err("invalid inline data offset!\n");
 646                 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
 647         }
 648
 649         /* no data command? */
 650         if (!len)
 651                 return 0;
 652
 653         nvmet_rdma_use_inline_sg(rsp, len, off);
 654         rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
 655         rsp->req.transfer_len += len;
 656         return 0;
 657 }
 658
 659 static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
 660                 struct nvme_keyed_sgl_desc *sgl, bool invalidate)
 661 {
 662         struct rdma_cm_id *cm_id = rsp->queue->cm_id;
 663         u64 addr = le64_to_cpu(sgl->addr);
 664         u32 key = get_unaligned_le32(sgl->key);
 665         int ret;
 666
 667         rsp->req.transfer_len = get_unaligned_le24(sgl->length);
 668
 669         /* no data command? */
 670         if (!rsp->req.transfer_len)
 671                 return 0;
 672
 673         ret = nvmet_req_alloc_sgl(&rsp->req);
 674         if (ret < 0)
 675                 goto error_out;
 676
 677         ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
 678                         rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
 679                         nvmet_data_dir(&rsp->req));
 680         if (ret < 0)
 681                 goto error_out;
 682         rsp->n_rdma += ret;
 683
 684         if (invalidate) {
 685                 rsp->invalidate_rkey = key;
 686                 rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
 687         }
 688
 689         return 0;
 690
 691 error_out:
 692         rsp->req.transfer_len = 0;
 693         return NVME_SC_INTERNAL;
 694 }
 695
 696 static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
 697 {
 698         struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;
 699
 700         switch (sgl->type >> 4) {
 701         case NVME_SGL_FMT_DATA_DESC:
 702                 switch (sgl->type & 0xf) {
 703                 case NVME_SGL_FMT_OFFSET:
 704                         return nvmet_rdma_map_sgl_inline(rsp);
 705                 default:
 706                         pr_err("invalid SGL subtype: %#x\n", sgl->type);
 707                         rsp->req.error_loc =
 708                                 offsetof(struct nvme_common_command, dptr);
 709                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 710                 }
 711         case NVME_KEY_SGL_FMT_DATA_DESC:
 712                 switch (sgl->type & 0xf) {
 713                 case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
 714                         return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
 715                 case NVME_SGL_FMT_ADDRESS:
 716                         return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
 717                 default:
 718                         pr_err("invalid SGL subtype: %#x\n", sgl->type);
 719                         rsp->req.error_loc =
 720                                 offsetof(struct nvme_common_command, dptr);
 721                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 722                 }
 723         default:
 724                 pr_err("invalid SGL type: %#x\n", sgl->type);
 725                 rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
 726                 return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
 727         }
 728 }
 729
 730 static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
 731 {
 732         struct nvmet_rdma_queue *queue = rsp->queue;
 733
 734         if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
 735                         &queue->sq_wr_avail) < 0)) {
 736                 pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
 737                                 1 + rsp->n_rdma, queue->idx,
 738                                 queue->nvme_sq.ctrl->cntlid);
 739                 atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
 740                 return false;
 741         }
 742
 743         if (nvmet_rdma_need_data_in(rsp)) {
 744                 if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
 745                                 queue->cm_id->port_num, &rsp->read_cqe, NULL))
 746                         nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
 747         } else {
 748                 nvmet_req_execute(&rsp->req);
 749         }
 750
 751         return true;
 752 }
 753
 754 static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
 755                 struct nvmet_rdma_rsp *cmd)
 756 {
 757         u16 status;
 758
 759         ib_dma_sync_single_for_cpu(queue->dev->device,
 760                 cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
 761                 DMA_FROM_DEVICE);
 762         ib_dma_sync_single_for_cpu(queue->dev->device,
 763                 cmd->send_sge.addr, cmd->send_sge.length,
 764                 DMA_TO_DEVICE);
 765
 766         cmd->req.p2p_client = &queue->dev->device->dev;
 767
 768         if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
 769                         &queue->nvme_sq, &nvmet_rdma_ops))
 770                 return;
 771
 772         status = nvmet_rdma_map_sgl(cmd);
 773         if (status)
 774                 goto out_err;
 775
 776         if (unlikely(!nvmet_rdma_execute_command(cmd))) {
 777                 spin_lock(&queue->rsp_wr_wait_lock);
 778                 list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
 779                 spin_unlock(&queue->rsp_wr_wait_lock);
 780         }
 781
 782         return;
 783
 784 out_err:
 785         nvmet_req_complete(&cmd->req, status);
 786 }
 787
/*
 * RECV completion handler.  Validates the completion, pairs the received
 * command with a response, and either handles it right away (queue LIVE),
 * parks it until the queue becomes live (CONNECTING), or drops the response
 * (any other state).
 */
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct nvmet_rdma_cmd *cmd =
                container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
        struct nvmet_rdma_queue *queue = cq->cq_context;
        struct nvmet_rdma_rsp *rsp;

        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                /* Flush errors are not reported or escalated. */
                if (wc->status != IB_WC_WR_FLUSH_ERR) {
                        pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
                                wc->wr_cqe, ib_wc_status_msg(wc->status),
                                wc->status);
                        nvmet_rdma_error_comp(queue);
                }
                return;
        }

        /* A capsule must contain at least a full NVMe command. */
        if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
                pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
                nvmet_rdma_error_comp(queue);
                return;
        }

        cmd->queue = queue;
        rsp = nvmet_rdma_get_rsp(queue);
        if (unlikely(!rsp)) {
                /*
                 * we get here only under memory pressure,
                 * silently drop and have the host retry
                 * as we can't even fail it.
                 */
                nvmet_rdma_post_recv(queue->dev, cmd);
                return;
        }
        /* Reset the per-request state of the (possibly recycled) rsp. */
        rsp->queue = queue;
        rsp->cmd = cmd;
        rsp->flags = 0;
        rsp->req.cmd = cmd->nvme_cmd;
        rsp->req.port = queue->port;
        rsp->n_rdma = 0;

        if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
                unsigned long flags;

                /* Re-check the state under the lock before deciding. */
                spin_lock_irqsave(&queue->state_lock, flags);
                if (queue->state == NVMET_RDMA_Q_CONNECTING)
                        list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
                else
                        nvmet_rdma_put_rsp(rsp);
                spin_unlock_irqrestore(&queue->state_lock, flags);
                return;
        }

        nvmet_rdma_handle_command(queue, rsp);
}
 843
 844 static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
 845 {
 846         if (!ndev->srq)
 847                 return;
 848
 849         nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
 850         ib_destroy_srq(ndev->srq);
 851 }
 852
 853 static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
 854 {
 855         struct ib_srq_init_attr srq_attr = { NULL, };
 856         struct ib_srq *srq;
 857         size_t srq_size;
 858         int ret, i;
 859
 860         srq_size = 4095;        /* XXX: tune */
 861
 862         srq_attr.attr.max_wr = srq_size;
 863         srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
 864         srq_attr.attr.srq_limit = 0;
 865         srq_attr.srq_type = IB_SRQT_BASIC;
 866         srq = ib_create_srq(ndev->pd, &srq_attr);
 867         if (IS_ERR(srq)) {
 868                 /*
 869                  * If SRQs aren't supported we just go ahead and use normal
 870                  * non-shared receive queues.
 871                  */
 872                 pr_info("SRQ requested but not supported.\n");
 873                 return 0;
 874         }
 875
 876         ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
 877         if (IS_ERR(ndev->srq_cmds)) {
 878                 ret = PTR_ERR(ndev->srq_cmds);
 879                 goto out_destroy_srq;
 880         }
 881
 882         ndev->srq = srq;
 883         ndev->srq_size = srq_size;
 884
 885         for (i = 0; i < srq_size; i++) {
 886                 ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
 887                 if (ret)
 888                         goto out_free_cmds;
 889         }
 890
 891         return 0;
 892
 893 out_free_cmds:
 894         nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
 895 out_destroy_srq:
 896         ib_destroy_srq(srq);
 897         return ret;
 898 }
 899
 900 static void nvmet_rdma_free_dev(struct kref *ref)
 901 {
 902         struct nvmet_rdma_device *ndev =
 903                 container_of(ref, struct nvmet_rdma_device, ref);
 904
 905         mutex_lock(&device_list_mutex);
 906         list_del(&ndev->entry);
 907         mutex_unlock(&device_list_mutex);
 908
 909         nvmet_rdma_destroy_srq(ndev);
 910         ib_dealloc_pd(ndev->pd);
 911
 912         kfree(ndev);
 913 }
 914
 915 static struct nvmet_rdma_device *
 916 nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
 917 {
 918         struct nvmet_port *port = cm_id->context;
 919         struct nvmet_rdma_device *ndev;
 920         int inline_page_count;
 921         int inline_sge_count;
 922         int ret;
 923
 924         mutex_lock(&device_list_mutex);
 925         list_for_each_entry(ndev, &device_list, entry) {
 926                 if (ndev->device->node_guid == cm_id->device->node_guid &&
 927                     kref_get_unless_zero(&ndev->ref))
 928                         goto out_unlock;
 929         }
 930
 931         ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
 932         if (!ndev)
 933                 goto out_err;
 934
 935         inline_page_count = num_pages(port->inline_data_size);
 936         inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
 937                                 cm_id->device->attrs.max_recv_sge) - 1;
 938         if (inline_page_count > inline_sge_count) {
 939                 pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
 940                         port->inline_data_size, cm_id->device->name,
 941                         inline_sge_count * PAGE_SIZE);
 942                 port->inline_data_size = inline_sge_count * PAGE_SIZE;
 943                 inline_page_count = inline_sge_count;
 944         }
 945         ndev->inline_data_size = port->inline_data_size;
 946         ndev->inline_page_count = inline_page_count;
 947         ndev->device = cm_id->device;
 948         kref_init(&ndev->ref);
 949
 950         ndev->pd = ib_alloc_pd(ndev->device, 0);
 951         if (IS_ERR(ndev->pd))
 952                 goto out_free_dev;
 953
 954         if (nvmet_rdma_use_srq) {
 955                 ret = nvmet_rdma_init_srq(ndev);
 956                 if (ret)
 957                         goto out_free_pd;
 958         }
 959
 960         list_add(&ndev->entry, &device_list);
 961 out_unlock:
 962         mutex_unlock(&device_list_mutex);
 963         pr_debug("added %s.\n", ndev->device->name);
 964         return ndev;
 965
 966 out_free_pd:
 967         ib_dealloc_pd(ndev->pd);
 968 out_free_dev:
 969         kfree(ndev);
 970 out_err:
 971         mutex_unlock(&device_list_mutex);
 972         return NULL;
 973 }
 974
 975 static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
 976 {
 977         struct ib_qp_init_attr qp_attr;
 978         struct nvmet_rdma_device *ndev = queue->dev;
 979         int comp_vector, nr_cqe, ret, i;
 980
 981         /*
 982          * Spread the io queues across completion vectors,
 983          * but still keep all admin queues on vector 0.
 984          */
 985         comp_vector = !queue->host_qid ? 0 :
 986                 queue->idx % ndev->device->num_comp_vectors;
 987
 988         /*
 989          * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
 990          */
 991         nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
 992
 993         queue->cq = ib_alloc_cq(ndev->device, queue,
 994                         nr_cqe + 1, comp_vector,
 995                         IB_POLL_WORKQUEUE);
 996         if (IS_ERR(queue->cq)) {
 997                 ret = PTR_ERR(queue->cq);
 998                 pr_err("failed to create CQ cqe= %d ret= %d\n",
 999                        nr_cqe + 1, ret);
1000                 goto out;
1001         }
1002
1003         memset(&qp_attr, 0, sizeof(qp_attr));
1004         qp_attr.qp_context = queue;
1005         qp_attr.event_handler = nvmet_rdma_qp_event;
1006         qp_attr.send_cq = queue->cq;
1007         qp_attr.recv_cq = queue->cq;
1008         qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1009         qp_attr.qp_type = IB_QPT_RC;
1010         /* +1 for drain */
1011         qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
1012         qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
1013         qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
1014                                         ndev->device->attrs.max_send_sge);
1015
1016         if (ndev->srq) {
1017                 qp_attr.srq = ndev->srq;
1018         } else {
1019                 /* +1 for drain */
1020                 qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
1021                 qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
1022         }
1023
1024         ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
1025         if (ret) {
1026                 pr_err("failed to create_qp ret= %d\n", ret);
1027                 goto err_destroy_cq;
1028         }
1029
1030         atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
1031
1032         pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
1033                  __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
1034                  qp_attr.cap.max_send_wr, queue->cm_id);
1035
1036         if (!ndev->srq) {
1037                 for (i = 0; i < queue->recv_queue_size; i++) {
1038                         queue->cmds[i].queue = queue;
1039                         ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
1040                         if (ret)
1041                                 goto err_destroy_qp;
1042                 }
1043         }
1044
1045 out:
1046         return ret;
1047
1048 err_destroy_qp:
1049         rdma_destroy_qp(queue->cm_id);
1050 err_destroy_cq:
1051         ib_free_cq(queue->cq);
1052         goto out;
1053 }
1054
1055 static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
1056 {
1057         struct ib_qp *qp = queue->cm_id->qp;
1058
1059         ib_drain_qp(qp);
1060         rdma_destroy_id(queue->cm_id);
1061         ib_destroy_qp(qp);
1062         ib_free_cq(queue->cq);
1063 }
1064
1065 static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
1066 {
1067         pr_debug("freeing queue %d\n", queue->idx);
1068
1069         nvmet_sq_destroy(&queue->nvme_sq);
1070
1071         nvmet_rdma_destroy_queue_ib(queue);
1072         if (!queue->dev->srq) {
1073                 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1074                                 queue->recv_queue_size,
1075                                 !queue->host_qid);
1076         }
1077         nvmet_rdma_free_rsps(queue);
1078         ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1079         kfree(queue);
1080 }
1081
1082 static void nvmet_rdma_release_queue_work(struct work_struct *w)
1083 {
1084         struct nvmet_rdma_queue *queue =
1085                 container_of(w, struct nvmet_rdma_queue, release_work);
1086         struct nvmet_rdma_device *dev = queue->dev;
1087
1088         nvmet_rdma_free_queue(queue);
1089
1090         kref_put(&dev->ref, nvmet_rdma_free_dev);
1091 }
1092
1093 static int
1094 nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
1095                                 struct nvmet_rdma_queue *queue)
1096 {
1097         struct nvme_rdma_cm_req *req;
1098
1099         req = (struct nvme_rdma_cm_req *)conn->private_data;
1100         if (!req || conn->private_data_len == 0)
1101                 return NVME_RDMA_CM_INVALID_LEN;
1102
1103         if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
1104                 return NVME_RDMA_CM_INVALID_RECFMT;
1105
1106         queue->host_qid = le16_to_cpu(req->qid);
1107
1108         /*
1109          * req->hsqsize corresponds to our recv queue size plus 1
1110          * req->hrqsize corresponds to our send queue size
1111          */
1112         queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
1113         queue->send_queue_size = le16_to_cpu(req->hrqsize);
1114
1115         if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
1116                 return NVME_RDMA_CM_INVALID_HSQSIZE;
1117
1118         /* XXX: Should we enforce some kind of max for IO queues? */
1119
1120         return 0;
1121 }
1122
1123 static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
1124                                 enum nvme_rdma_cm_status status)
1125 {
1126         struct nvme_rdma_cm_rej rej;
1127
1128         pr_debug("rejecting connect request: status %d (%s)\n",
1129                  status, nvme_rdma_cm_msg(status));
1130
1131         rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1132         rej.sts = cpu_to_le16(status);
1133
1134         return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
1135 }
1136
1137 static struct nvmet_rdma_queue *
1138 nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
1139                 struct rdma_cm_id *cm_id,
1140                 struct rdma_cm_event *event)
1141 {
1142         struct nvmet_rdma_queue *queue;
1143         int ret;
1144
1145         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1146         if (!queue) {
1147                 ret = NVME_RDMA_CM_NO_RSC;
1148                 goto out_reject;
1149         }
1150
1151         ret = nvmet_sq_init(&queue->nvme_sq);
1152         if (ret) {
1153                 ret = NVME_RDMA_CM_NO_RSC;
1154                 goto out_free_queue;
1155         }
1156
1157         ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
1158         if (ret)
1159                 goto out_destroy_sq;
1160
1161         /*
1162          * Schedules the actual release because calling rdma_destroy_id from
1163          * inside a CM callback would trigger a deadlock. (great API design..)
1164          */
1165         INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
1166         queue->dev = ndev;
1167         queue->cm_id = cm_id;
1168
1169         spin_lock_init(&queue->state_lock);
1170         queue->state = NVMET_RDMA_Q_CONNECTING;
1171         INIT_LIST_HEAD(&queue->rsp_wait_list);
1172         INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
1173         spin_lock_init(&queue->rsp_wr_wait_lock);
1174         INIT_LIST_HEAD(&queue->free_rsps);
1175         spin_lock_init(&queue->rsps_lock);
1176         INIT_LIST_HEAD(&queue->queue_list);
1177
1178         queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
1179         if (queue->idx < 0) {
1180                 ret = NVME_RDMA_CM_NO_RSC;
1181                 goto out_destroy_sq;
1182         }
1183
1184         ret = nvmet_rdma_alloc_rsps(queue);
1185         if (ret) {
1186                 ret = NVME_RDMA_CM_NO_RSC;
1187                 goto out_ida_remove;
1188         }
1189
1190         if (!ndev->srq) {
1191                 queue->cmds = nvmet_rdma_alloc_cmds(ndev,
1192                                 queue->recv_queue_size,
1193                                 !queue->host_qid);
1194                 if (IS_ERR(queue->cmds)) {
1195                         ret = NVME_RDMA_CM_NO_RSC;
1196                         goto out_free_responses;
1197                 }
1198         }
1199
1200         ret = nvmet_rdma_create_queue_ib(queue);
1201         if (ret) {
1202                 pr_err("%s: creating RDMA queue failed (%d).\n",
1203                         __func__, ret);
1204                 ret = NVME_RDMA_CM_NO_RSC;
1205                 goto out_free_cmds;
1206         }
1207
1208         return queue;
1209
1210 out_free_cmds:
1211         if (!ndev->srq) {
1212                 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1213                                 queue->recv_queue_size,
1214                                 !queue->host_qid);
1215         }
1216 out_free_responses:
1217         nvmet_rdma_free_rsps(queue);
1218 out_ida_remove:
1219         ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1220 out_destroy_sq:
1221         nvmet_sq_destroy(&queue->nvme_sq);
1222 out_free_queue:
1223         kfree(queue);
1224 out_reject:
1225         nvmet_rdma_cm_reject(cm_id, ret);
1226         return NULL;
1227 }
1228
1229 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
1230 {
1231         struct nvmet_rdma_queue *queue = priv;
1232
1233         switch (event->event) {
1234         case IB_EVENT_COMM_EST:
1235                 rdma_notify(queue->cm_id, event->event);
1236                 break;
1237         default:
1238                 pr_err("received IB QP event: %s (%d)\n",
1239                        ib_event_msg(event->event), event->event);
1240                 break;
1241         }
1242 }
1243
1244 static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
1245                 struct nvmet_rdma_queue *queue,
1246                 struct rdma_conn_param *p)
1247 {
1248         struct rdma_conn_param  param = { };
1249         struct nvme_rdma_cm_rep priv = { };
1250         int ret = -ENOMEM;
1251
1252         param.rnr_retry_count = 7;
1253         param.flow_control = 1;
1254         param.initiator_depth = min_t(u8, p->initiator_depth,
1255                 queue->dev->device->attrs.max_qp_init_rd_atom);
1256         param.private_data = &priv;
1257         param.private_data_len = sizeof(priv);
1258         priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1259         priv.crqsize = cpu_to_le16(queue->recv_queue_size);
1260
1261         ret = rdma_accept(cm_id, &param);
1262         if (ret)
1263                 pr_err("rdma_accept failed (error code = %d)\n", ret);
1264
1265         return ret;
1266 }
1267
/*
 * Handle RDMA_CM_EVENT_CONNECT_REQUEST: find (or create) the device behind
 * @cm_id, allocate a queue for the request, accept the connection and put
 * the queue on the global queue list.
 *
 * Returns 0 when the queue was set up, and also when the accept failed and
 * the queue release was scheduled. Returns a negative errno when no device
 * or queue could be obtained.
 */
static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event)
{
        struct nvmet_rdma_device *ndev;
        struct nvmet_rdma_queue *queue;

        ndev = nvmet_rdma_find_get_device(cm_id);
        if (!ndev) {
                nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
                return -ECONNREFUSED;
        }

        /* nvmet_rdma_alloc_queue() rejects the request itself on failure. */
        queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
        if (!queue) {
                kref_put(&ndev->ref, nvmet_rdma_free_dev);
                return -ENOMEM;
        }
        queue->port = cm_id->context;

        /* Admin queue: let inflight controller teardown complete first. */
        if (!queue->host_qid)
                flush_scheduled_work();

        if (nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn)) {
                /*
                 * The release work destroys the rdma_cm id along with the
                 * queue, so it is not destroyed here.
                 */
                schedule_work(&queue->release_work);
                return 0;
        }

        mutex_lock(&nvmet_rdma_queue_mutex);
        list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
        mutex_unlock(&nvmet_rdma_queue_mutex);

        return 0;
}
1311
1312 static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
1313 {
1314         unsigned long flags;
1315
1316         spin_lock_irqsave(&queue->state_lock, flags);
1317         if (queue->state != NVMET_RDMA_Q_CONNECTING) {
1318                 pr_warn("trying to establish a connected queue\n");
1319                 goto out_unlock;
1320         }
1321         queue->state = NVMET_RDMA_Q_LIVE;
1322
1323         while (!list_empty(&queue->rsp_wait_list)) {
1324                 struct nvmet_rdma_rsp *cmd;
1325
1326                 cmd = list_first_entry(&queue->rsp_wait_list,
1327                                         struct nvmet_rdma_rsp, wait_list);
1328                 list_del(&cmd->wait_list);
1329
1330                 spin_unlock_irqrestore(&queue->state_lock, flags);
1331                 nvmet_rdma_handle_command(queue, cmd);
1332                 spin_lock_irqsave(&queue->state_lock, flags);
1333         }
1334
1335 out_unlock:
1336         spin_unlock_irqrestore(&queue->state_lock, flags);
1337 }
1338
1339 static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1340 {
1341         bool disconnect = false;
1342         unsigned long flags;
1343
1344         pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);
1345
1346         spin_lock_irqsave(&queue->state_lock, flags);
1347         switch (queue->state) {
1348         case NVMET_RDMA_Q_CONNECTING:
1349         case NVMET_RDMA_Q_LIVE:
1350                 queue->state = NVMET_RDMA_Q_DISCONNECTING;
1351                 disconnect = true;
1352                 break;
1353         case NVMET_RDMA_Q_DISCONNECTING:
1354                 break;
1355         }
1356         spin_unlock_irqrestore(&queue->state_lock, flags);
1357
1358         if (disconnect) {
1359                 rdma_disconnect(queue->cm_id);
1360                 schedule_work(&queue->release_work);
1361         }
1362 }
1363
1364 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1365 {
1366         bool disconnect = false;
1367
1368         mutex_lock(&nvmet_rdma_queue_mutex);
1369         if (!list_empty(&queue->queue_list)) {
1370                 list_del_init(&queue->queue_list);
1371                 disconnect = true;
1372         }
1373         mutex_unlock(&nvmet_rdma_queue_mutex);
1374
1375         if (disconnect)
1376                 __nvmet_rdma_queue_disconnect(queue);
1377 }
1378
1379 static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
1380                 struct nvmet_rdma_queue *queue)
1381 {
1382         WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);
1383
1384         mutex_lock(&nvmet_rdma_queue_mutex);
1385         if (!list_empty(&queue->queue_list))
1386                 list_del_init(&queue->queue_list);
1387         mutex_unlock(&nvmet_rdma_queue_mutex);
1388
1389         pr_err("failed to connect queue %d\n", queue->idx);
1390         schedule_work(&queue->release_work);
1391 }
1392
1393 /**
1394  * nvme_rdma_device_removal() - Handle RDMA device removal
1395  * @cm_id:      rdma_cm id, used for nvmet port
1396  * @queue:      nvmet rdma queue (cm id qp_context)
1397  *
1398  * DEVICE_REMOVAL event notifies us that the RDMA device is about
1399  * to unplug. Note that this event can be generated on a normal
1400  * queue cm_id and/or a device bound listener cm_id (where in this
1401  * case queue will be null).
1402  *
1403  * We registered an ib_client to handle device removal for queues,
1404  * so we only need to handle the listening port cm_ids. In this case
1405  * we nullify the priv to prevent double cm_id destruction and destroying
1406  * the cm_id implicitely by returning a non-zero rc to the callout.
1407  */
1408 static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
1409                 struct nvmet_rdma_queue *queue)
1410 {
1411         struct nvmet_port *port;
1412
1413         if (queue) {
1414                 /*
1415                  * This is a queue cm_id. we have registered
1416                  * an ib_client to handle queues removal
1417                  * so don't interfear and just return.
1418                  */
1419                 return 0;
1420         }
1421
1422         port = cm_id->context;
1423
1424         /*
1425          * This is a listener cm_id. Make sure that
1426          * future remove_port won't invoke a double
1427          * cm_id destroy. use atomic xchg to make sure
1428          * we don't compete with remove_port.
1429          */
1430         if (xchg(&port->priv, NULL) != cm_id)
1431                 return 0;
1432
1433         /*
1434          * We need to return 1 so that the core will destroy
1435          * it's own ID.  What a great API design..
1436          */
1437         return 1;
1438 }
1439
1440 static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
1441                 struct rdma_cm_event *event)
1442 {
1443         struct nvmet_rdma_queue *queue = NULL;
1444         int ret = 0;
1445
1446         if (cm_id->qp)
1447                 queue = cm_id->qp->qp_context;
1448
1449         pr_debug("%s (%d): status %d id %p\n",
1450                 rdma_event_msg(event->event), event->event,
1451                 event->status, cm_id);
1452
1453         switch (event->event) {
1454         case RDMA_CM_EVENT_CONNECT_REQUEST:
1455                 ret = nvmet_rdma_queue_connect(cm_id, event);
1456                 break;
1457         case RDMA_CM_EVENT_ESTABLISHED:
1458                 nvmet_rdma_queue_established(queue);
1459                 break;
1460         case RDMA_CM_EVENT_ADDR_CHANGE:
1461         case RDMA_CM_EVENT_DISCONNECTED:
1462         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1463                 nvmet_rdma_queue_disconnect(queue);
1464                 break;
1465         case RDMA_CM_EVENT_DEVICE_REMOVAL:
1466                 ret = nvmet_rdma_device_removal(cm_id, queue);
1467                 break;
1468         case RDMA_CM_EVENT_REJECTED:
1469                 pr_debug("Connection rejected: %s\n",
1470                          rdma_reject_msg(cm_id, event->status));
1471                 /* FALLTHROUGH */
1472         case RDMA_CM_EVENT_UNREACHABLE:
1473         case RDMA_CM_EVENT_CONNECT_ERROR:
1474                 nvmet_rdma_queue_connect_fail(cm_id, queue);
1475                 break;
1476         default:
1477                 pr_err("received unrecognized RDMA CM event %d\n",
1478                         event->event);
1479                 break;
1480         }
1481
1482         return ret;
1483 }
1484
1485 static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
1486 {
1487         struct nvmet_rdma_queue *queue;
1488
1489 restart:
1490         mutex_lock(&nvmet_rdma_queue_mutex);
1491         list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
1492                 if (queue->nvme_sq.ctrl == ctrl) {
1493                         list_del_init(&queue->queue_list);
1494                         mutex_unlock(&nvmet_rdma_queue_mutex);
1495
1496                         __nvmet_rdma_queue_disconnect(queue);
1497                         goto restart;
1498                 }
1499         }
1500         mutex_unlock(&nvmet_rdma_queue_mutex);
1501 }
1502
1503 static int nvmet_rdma_add_port(struct nvmet_port *port)
1504 {
1505         struct rdma_cm_id *cm_id;
1506         struct sockaddr_storage addr = { };
1507         __kernel_sa_family_t af;
1508         int ret;
1509
1510         switch (port->disc_addr.adrfam) {
1511         case NVMF_ADDR_FAMILY_IP4:
1512                 af = AF_INET;
1513                 break;
1514         case NVMF_ADDR_FAMILY_IP6:
1515                 af = AF_INET6;
1516                 break;
1517         default:
1518                 pr_err("address family %d not supported\n",
1519                                 port->disc_addr.adrfam);
1520                 return -EINVAL;
1521         }
1522
1523         if (port->inline_data_size < 0) {
1524                 port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
1525         } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
1526                 pr_warn("inline_data_size %u is too large, reducing to %u\n",
1527                         port->inline_data_size,
1528                         NVMET_RDMA_MAX_INLINE_DATA_SIZE);
1529                 port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
1530         }
1531
1532         ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
1533                         port->disc_addr.trsvcid, &addr);
1534         if (ret) {
1535                 pr_err("malformed ip/port passed: %s:%s\n",
1536                         port->disc_addr.traddr, port->disc_addr.trsvcid);
1537                 return ret;
1538         }
1539
1540         cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
1541                         RDMA_PS_TCP, IB_QPT_RC);
1542         if (IS_ERR(cm_id)) {
1543                 pr_err("CM ID creation failed\n");
1544                 return PTR_ERR(cm_id);
1545         }
1546
1547         /*
1548          * Allow both IPv4 and IPv6 sockets to bind a single port
1549          * at the same time.
1550          */
1551         ret = rdma_set_afonly(cm_id, 1);
1552         if (ret) {
1553                 pr_err("rdma_set_afonly failed (%d)\n", ret);
1554                 goto out_destroy_id;
1555         }
1556
1557         ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
1558         if (ret) {
1559                 pr_err("binding CM ID to %pISpcs failed (%d)\n",
1560                         (struct sockaddr *)&addr, ret);
1561                 goto out_destroy_id;
1562         }
1563
1564         ret = rdma_listen(cm_id, 128);
1565         if (ret) {
1566                 pr_err("listening to %pISpcs failed (%d)\n",
1567                         (struct sockaddr *)&addr, ret);
1568                 goto out_destroy_id;
1569         }
1570
1571         pr_info("enabling port %d (%pISpcs)\n",
1572                 le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
1573         port->priv = cm_id;
1574         return 0;
1575
1576 out_destroy_id:
1577         rdma_destroy_id(cm_id);
1578         return ret;
1579 }
1580
1581 static void nvmet_rdma_remove_port(struct nvmet_port *port)
1582 {
1583         struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);
1584
1585         if (cm_id)
1586                 rdma_destroy_id(cm_id);
1587 }
1588
1589 static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
1590                 struct nvmet_port *port, char *traddr)
1591 {
1592         struct rdma_cm_id *cm_id = port->priv;
1593
1594         if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
1595                 struct nvmet_rdma_rsp *rsp =
1596                         container_of(req, struct nvmet_rdma_rsp, req);
1597                 struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
1598                 struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;
1599
1600                 sprintf(traddr, "%pISc", addr);
1601         } else {
1602                 memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
1603         }
1604 }
1605
/*
 * Transport operations of the RDMA target; registered with the nvmet core
 * by nvmet_rdma_init() and unregistered by nvmet_rdma_exit().
 */
static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
        .owner                  = THIS_MODULE,
        .type                   = NVMF_TRTYPE_RDMA,
        .msdbd                  = 1,
        .has_keyed_sgls         = 1,
        /* port enable/disable: create/destroy the listener cm_id */
        .add_port               = nvmet_rdma_add_port,
        .remove_port            = nvmet_rdma_remove_port,
        .queue_response         = nvmet_rdma_queue_response,
        /* disconnects all queues that belong to the controller */
        .delete_ctrl            = nvmet_rdma_delete_ctrl,
        .disc_traddr            = nvmet_rdma_disc_port_addr,
};
1617
1618 static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
1619 {
1620         struct nvmet_rdma_queue *queue, *tmp;
1621         struct nvmet_rdma_device *ndev;
1622         bool found = false;
1623
1624         mutex_lock(&device_list_mutex);
1625         list_for_each_entry(ndev, &device_list, entry) {
1626                 if (ndev->device == ib_device) {
1627                         found = true;
1628                         break;
1629                 }
1630         }
1631         mutex_unlock(&device_list_mutex);
1632
1633         if (!found)
1634                 return;
1635
1636         /*
1637          * IB Device that is used by nvmet controllers is being removed,
1638          * delete all queues using this device.
1639          */
1640         mutex_lock(&nvmet_rdma_queue_mutex);
1641         list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
1642                                  queue_list) {
1643                 if (queue->dev->device != ib_device)
1644                         continue;
1645
1646                 pr_info("Removing queue %d\n", queue->idx);
1647                 list_del_init(&queue->queue_list);
1648                 __nvmet_rdma_queue_disconnect(queue);
1649         }
1650         mutex_unlock(&nvmet_rdma_queue_mutex);
1651
1652         flush_scheduled_work();
1653 }
1654
/*
 * IB client registered by nvmet_rdma_init(); only the remove callback is
 * provided, which disconnects the queues of a device that is going away.
 */
static struct ib_client nvmet_rdma_ib_client = {
        .name   = "nvmet_rdma",
        .remove = nvmet_rdma_remove_one
};
1659
1660 static int __init nvmet_rdma_init(void)
1661 {
1662         int ret;
1663
1664         ret = ib_register_client(&nvmet_rdma_ib_client);
1665         if (ret)
1666                 return ret;
1667
1668         ret = nvmet_register_transport(&nvmet_rdma_ops);
1669         if (ret)
1670                 goto err_ib_client;
1671
1672         return 0;
1673
1674 err_ib_client:
1675         ib_unregister_client(&nvmet_rdma_ib_client);
1676         return ret;
1677 }
1678
/*
 * Module exit: undo nvmet_rdma_init() in reverse order, then release the
 * queue index allocator. No queue is expected to be left on the global
 * queue list at that point.
 */
static void __exit nvmet_rdma_exit(void)
{
        nvmet_unregister_transport(&nvmet_rdma_ops);
        ib_unregister_client(&nvmet_rdma_ib_client);
        /* All queues should have been removed from the list by now. */
        WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
        ida_destroy(&nvmet_rdma_queue_ida);
}
1686
/* Module entry and exit points. */
module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */