net/sunrpc/xprtrdma/verbs.c: git blame view (git.proxmox.com, mirror_ubuntu-hirsute-kernel.git, tag Linux 3.19-rc5)
f58851e6 1/*
c56c65fb
TT
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
f58851e6
TT
38 */
39
c56c65fb
TT
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
a6b7a407 50#include <linux/interrupt.h>
5a0e3ad6 51#include <linux/slab.h>
65866f82 52#include <asm/bitops.h>
c56c65fb 53
f58851e6
TT
54#include "xprt_rdma.h"
55
c56c65fb
TT
56/*
57 * Globals/Macros
58 */
59
f895b252 60#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb
TT
61# define RPCDBG_FACILITY RPCDBG_TRANS
62#endif
63
9f9d802a 64static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
467c9674 65static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
9f9d802a 66
c56c65fb
TT
67/*
68 * internal functions
69 */
70
71/*
72 * handle replies in tasklet context, using a single, global list
73 * rdma tasklet function -- just turn around and call the func
74 * for all replies on the list
75 */
76
77static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
78static LIST_HEAD(rpcrdma_tasklets_g);
79
80static void
81rpcrdma_run_tasklet(unsigned long data)
82{
83 struct rpcrdma_rep *rep;
84 void (*func)(struct rpcrdma_rep *);
85 unsigned long flags;
86
87 data = data;
88 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
89 while (!list_empty(&rpcrdma_tasklets_g)) {
90 rep = list_entry(rpcrdma_tasklets_g.next,
91 struct rpcrdma_rep, rr_list);
92 list_del(&rep->rr_list);
93 func = rep->rr_func;
94 rep->rr_func = NULL;
95 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
96
97 if (func)
98 func(rep);
99 else
100 rpcrdma_recv_buffer_put(rep);
101
102 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
103 }
104 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
105}
106
107static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
108
7ff11de1
CL
109static const char * const async_event[] = {
110 "CQ error",
111 "QP fatal error",
112 "QP request error",
113 "QP access error",
114 "communication established",
115 "send queue drained",
116 "path migration successful",
117 "path mig error",
118 "device fatal error",
119 "port active",
120 "port error",
121 "LID change",
122 "P_key change",
123 "SM change",
124 "SRQ error",
125 "SRQ limit reached",
126 "last WQE reached",
127 "client reregister",
128 "GID change",
129};
130
131#define ASYNC_MSG(status) \
132 ((status) < ARRAY_SIZE(async_event) ? \
133 async_event[(status)] : "unknown async error")
134
f1a03b76
CL
135static void
136rpcrdma_schedule_tasklet(struct list_head *sched_list)
137{
138 unsigned long flags;
139
140 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
141 list_splice_tail(sched_list, &rpcrdma_tasklets_g);
142 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
143 tasklet_schedule(&rpcrdma_tasklet_g);
144}
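/* Reply dispatch, end to end: rpcrdma_recvcq_poll() collects completed
 * receives on a local list, rpcrdma_schedule_tasklet() splices that list
 * onto rpcrdma_tasklets_g under rpcrdma_tk_lock_g, and rpcrdma_run_tasklet()
 * later drains the global list in softirq context, calling each rep's
 * rr_func (the reply handler) or, if none is set, returning the buffer
 * via rpcrdma_recv_buffer_put().
 */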
145
c56c65fb
TT
146static void
147rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
148{
149 struct rpcrdma_ep *ep = context;
150
7ff11de1
CL
151 pr_err("RPC: %s: %s on device %s ep %p\n",
152 __func__, ASYNC_MSG(event->event),
153 event->device->name, context);
c56c65fb
TT
154 if (ep->rep_connected == 1) {
155 ep->rep_connected = -EIO;
156 ep->rep_func(ep);
157 wake_up_all(&ep->rep_connect_wait);
158 }
159}
160
161static void
162rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
163{
164 struct rpcrdma_ep *ep = context;
165
7ff11de1
CL
166 pr_err("RPC: %s: %s on device %s ep %p\n",
167 __func__, ASYNC_MSG(event->event),
168 event->device->name, context);
c56c65fb
TT
169 if (ep->rep_connected == 1) {
170 ep->rep_connected = -EIO;
171 ep->rep_func(ep);
172 wake_up_all(&ep->rep_connect_wait);
173 }
174}
175
fc664485
CL
176static void
177rpcrdma_sendcq_process_wc(struct ib_wc *wc)
c56c65fb 178{
fc664485 179 struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
c56c65fb 180
fc664485
CL
181 dprintk("RPC: %s: frmr %p status %X opcode %d\n",
182 __func__, frmr, wc->status, wc->opcode);
c56c65fb 183
fc664485 184 if (wc->wr_id == 0ULL)
c56c65fb 185 return;
dab7e3b8 186 if (wc->status != IB_WC_SUCCESS)
9f9d802a 187 frmr->r.frmr.fr_state = FRMR_IS_STALE;
c56c65fb
TT
188}
189
fc664485 190static int
1c00dd07 191rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
c56c65fb 192{
1c00dd07 193 struct ib_wc *wcs;
8301a2c0 194 int budget, count, rc;
c56c65fb 195
8301a2c0 196 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
1c00dd07
CL
197 do {
198 wcs = ep->rep_send_wcs;
199
200 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
201 if (rc <= 0)
202 return rc;
203
204 count = rc;
205 while (count-- > 0)
206 rpcrdma_sendcq_process_wc(wcs++);
8301a2c0 207 } while (rc == RPCRDMA_POLLSIZE && --budget);
1c00dd07 208 return 0;
fc664485 209}
c56c65fb 210
fc664485
CL
211/*
212 * Handle send, fast_reg_mr, and local_inv completions.
213 *
214 * Send events are typically suppressed and thus do not result
215 * in an upcall. Occasionally one is signaled, however. This
216 * prevents the provider's completion queue from wrapping and
217 * losing a completion.
218 */
219static void
220rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
221{
1c00dd07 222 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
fc664485
CL
223 int rc;
224
1c00dd07 225 rc = rpcrdma_sendcq_poll(cq, ep);
fc664485
CL
226 if (rc) {
227 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
228 __func__, rc);
229 return;
c56c65fb
TT
230 }
231
7f23f6f6
CL
232 rc = ib_req_notify_cq(cq,
233 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
234 if (rc == 0)
235 return;
236 if (rc < 0) {
fc664485
CL
237 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
238 __func__, rc);
239 return;
240 }
241
1c00dd07 242 rpcrdma_sendcq_poll(cq, ep);
fc664485
CL
243}
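/* Unsignaled sends, roughly: rep_cqinit (set up in rpcrdma_ep_create below)
 * is the number of sends that may go unsignaled before a completion must be
 * requested.  INIT_CQCOUNT() reloads the counter and DECR_CQCOUNT() is
 * called before each send WR goes out; the posting path (rpcrdma_ep_post(),
 * outside this excerpt) does approximately:
 *
 *	if (DECR_CQCOUNT(ep) > 0)
 *		send_wr.send_flags = 0;			// suppressed
 *	else {
 *		INIT_CQCOUNT(ep);			// reset the window
 *		send_wr.send_flags = IB_SEND_SIGNALED;	// request an upcall
 *	}
 */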
244
245static void
bb96193d 246rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
fc664485
CL
247{
248 struct rpcrdma_rep *rep =
249 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
250
251 dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
252 __func__, rep, wc->status, wc->opcode, wc->byte_len);
253
254 if (wc->status != IB_WC_SUCCESS) {
255 rep->rr_len = ~0U;
256 goto out_schedule;
257 }
258 if (wc->opcode != IB_WC_RECV)
259 return;
260
261 rep->rr_len = wc->byte_len;
262 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
263 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
264
265 if (rep->rr_len >= 16) {
266 struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
267 unsigned int credits = ntohl(p->rm_credit);
268
269 if (credits == 0)
270 credits = 1; /* don't deadlock */
271 else if (credits > rep->rr_buffer->rb_max_requests)
272 credits = rep->rr_buffer->rb_max_requests;
273 atomic_set(&rep->rr_buffer->rb_credits, credits);
274 }
275
276out_schedule:
bb96193d 277 list_add_tail(&rep->rr_list, sched_list);
fc664485
CL
278}
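/* Credit flow, in brief: the rm_credit field of each reply header
 * advertises how many concurrent requests the server will accept.  For
 * example, a reply carrying rm_credit = 32 sets rb_credits to 32 (clamped
 * to rb_max_requests); an advertisement of zero is treated as one credit
 * so the transport cannot deadlock waiting for a grant that never comes.
 */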
279
280static int
1c00dd07 281rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
fc664485 282{
bb96193d 283 struct list_head sched_list;
1c00dd07 284 struct ib_wc *wcs;
8301a2c0 285 int budget, count, rc;
fc664485 286
bb96193d 287 INIT_LIST_HEAD(&sched_list);
8301a2c0 288 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
1c00dd07
CL
289 do {
290 wcs = ep->rep_recv_wcs;
291
292 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
293 if (rc <= 0)
bb96193d 294 goto out_schedule;
1c00dd07
CL
295
296 count = rc;
297 while (count-- > 0)
bb96193d 298 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
8301a2c0 299 } while (rc == RPCRDMA_POLLSIZE && --budget);
bb96193d
CL
300 rc = 0;
301
302out_schedule:
f1a03b76 303 rpcrdma_schedule_tasklet(&sched_list);
bb96193d 304 return rc;
c56c65fb
TT
305}
306
307/*
fc664485 308 * Handle receive completions.
c56c65fb 309 *
c56c65fb
TT
310 * It is reentrant but processes single events in order to maintain
311 * ordering of receives to keep server credits.
312 *
313 * It is the responsibility of the scheduled tasklet to return
314 * recv buffers to the pool. NOTE: this affects synchronization of
315 * connection shutdown. That is, the structures required for
316 * the completion of the reply handler must remain intact until
317 * all memory has been reclaimed.
c56c65fb
TT
318 */
319static void
fc664485 320rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
c56c65fb 321{
1c00dd07 322 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
c56c65fb
TT
323 int rc;
324
1c00dd07 325 rc = rpcrdma_recvcq_poll(cq, ep);
fc664485
CL
326 if (rc) {
327 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
328 __func__, rc);
c56c65fb 329 return;
fc664485 330 }
c56c65fb 331
7f23f6f6
CL
332 rc = ib_req_notify_cq(cq,
333 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
334 if (rc == 0)
335 return;
336 if (rc < 0) {
fc664485 337 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
c56c65fb
TT
338 __func__, rc);
339 return;
340 }
341
1c00dd07 342 rpcrdma_recvcq_poll(cq, ep);
c56c65fb
TT
343}
344
a7bc211a
CL
345static void
346rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
347{
5c166bef
CL
348 struct ib_wc wc;
349 LIST_HEAD(sched_list);
350
351 while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
352 rpcrdma_recvcq_process_wc(&wc, &sched_list);
353 if (!list_empty(&sched_list))
354 rpcrdma_schedule_tasklet(&sched_list);
355 while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
356 rpcrdma_sendcq_process_wc(&wc);
a7bc211a
CL
357}
358
f895b252 359#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb
TT
360static const char * const conn[] = {
361 "address resolved",
362 "address error",
363 "route resolved",
364 "route error",
365 "connect request",
366 "connect response",
367 "connect error",
368 "unreachable",
369 "rejected",
370 "established",
371 "disconnected",
8079fb78
CL
372 "device removal",
373 "multicast join",
374 "multicast error",
375 "address change",
376 "timewait exit",
c56c65fb 377};
8079fb78
CL
378
379#define CONNECTION_MSG(status) \
380 ((status) < ARRAY_SIZE(conn) ? \
381 conn[(status)] : "unrecognized connection error")
c56c65fb
TT
382#endif
383
384static int
385rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
386{
387 struct rpcrdma_xprt *xprt = id->context;
388 struct rpcrdma_ia *ia = &xprt->rx_ia;
389 struct rpcrdma_ep *ep = &xprt->rx_ep;
f895b252 390#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb 391 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
ff0db049 392#endif
c56c65fb
TT
393 struct ib_qp_attr attr;
394 struct ib_qp_init_attr iattr;
395 int connstate = 0;
396
397 switch (event->event) {
398 case RDMA_CM_EVENT_ADDR_RESOLVED:
399 case RDMA_CM_EVENT_ROUTE_RESOLVED:
5675add3 400 ia->ri_async_rc = 0;
c56c65fb
TT
401 complete(&ia->ri_done);
402 break;
403 case RDMA_CM_EVENT_ADDR_ERROR:
404 ia->ri_async_rc = -EHOSTUNREACH;
405 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
406 __func__, ep);
407 complete(&ia->ri_done);
408 break;
409 case RDMA_CM_EVENT_ROUTE_ERROR:
410 ia->ri_async_rc = -ENETUNREACH;
411 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
412 __func__, ep);
413 complete(&ia->ri_done);
414 break;
415 case RDMA_CM_EVENT_ESTABLISHED:
416 connstate = 1;
417 ib_query_qp(ia->ri_id->qp, &attr,
418 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
419 &iattr);
420 dprintk("RPC: %s: %d responder resources"
421 " (%d initiator)\n",
422 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
423 goto connected;
424 case RDMA_CM_EVENT_CONNECT_ERROR:
425 connstate = -ENOTCONN;
426 goto connected;
427 case RDMA_CM_EVENT_UNREACHABLE:
428 connstate = -ENETDOWN;
429 goto connected;
430 case RDMA_CM_EVENT_REJECTED:
431 connstate = -ECONNREFUSED;
432 goto connected;
433 case RDMA_CM_EVENT_DISCONNECTED:
434 connstate = -ECONNABORTED;
435 goto connected;
436 case RDMA_CM_EVENT_DEVICE_REMOVAL:
437 connstate = -ENODEV;
438connected:
c56c65fb
TT
439 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
440 dprintk("RPC: %s: %sconnected\n",
441 __func__, connstate > 0 ? "" : "dis");
442 ep->rep_connected = connstate;
443 ep->rep_func(ep);
444 wake_up_all(&ep->rep_connect_wait);
8079fb78 445 /*FALLTHROUGH*/
c56c65fb 446 default:
8079fb78
CL
447 dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n",
448 __func__, &addr->sin_addr.s_addr,
449 ntohs(addr->sin_port), ep,
450 CONNECTION_MSG(event->event));
c56c65fb
TT
451 break;
452 }
453
f895b252 454#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
b3cd8d45
TT
455 if (connstate == 1) {
456 int ird = attr.max_dest_rd_atomic;
457 int tird = ep->rep_remote_cma.responder_resources;
21454aaa 458 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
b3cd8d45 459 "on %s, memreg %d slots %d ird %d%s\n",
21454aaa 460 &addr->sin_addr.s_addr,
b3cd8d45
TT
461 ntohs(addr->sin_port),
462 ia->ri_id->device->name,
463 ia->ri_memreg_strategy,
464 xprt->rx_buf.rb_max_requests,
465 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
466 } else if (connstate < 0) {
21454aaa
HH
467 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
468 &addr->sin_addr.s_addr,
b3cd8d45
TT
469 ntohs(addr->sin_port),
470 connstate);
471 }
472#endif
473
c56c65fb
TT
474 return 0;
475}
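/* Summary of the mapping above, for reference:
 *
 *	RDMA_CM_EVENT_ESTABLISHED	-> connstate  1 (connected)
 *	RDMA_CM_EVENT_CONNECT_ERROR	-> connstate -ENOTCONN
 *	RDMA_CM_EVENT_UNREACHABLE	-> connstate -ENETDOWN
 *	RDMA_CM_EVENT_REJECTED		-> connstate -ECONNREFUSED
 *	RDMA_CM_EVENT_DISCONNECTED	-> connstate -ECONNABORTED
 *	RDMA_CM_EVENT_DEVICE_REMOVAL	-> connstate -ENODEV
 *
 * ep->rep_connected holds the most recent of these values, and waiters
 * on rep_connect_wait are woken whenever it changes.
 */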
476
477static struct rdma_cm_id *
478rpcrdma_create_id(struct rpcrdma_xprt *xprt,
479 struct rpcrdma_ia *ia, struct sockaddr *addr)
480{
481 struct rdma_cm_id *id;
482 int rc;
483
1a954051
TT
484 init_completion(&ia->ri_done);
485
b26f9b99 486 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
c56c65fb
TT
487 if (IS_ERR(id)) {
488 rc = PTR_ERR(id);
489 dprintk("RPC: %s: rdma_create_id() failed %i\n",
490 __func__, rc);
491 return id;
492 }
493
5675add3 494 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
495 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
496 if (rc) {
497 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
498 __func__, rc);
499 goto out;
500 }
5675add3
TT
501 wait_for_completion_interruptible_timeout(&ia->ri_done,
502 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
503 rc = ia->ri_async_rc;
504 if (rc)
505 goto out;
506
5675add3 507 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
508 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
509 if (rc) {
510 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
511 __func__, rc);
512 goto out;
513 }
5675add3
TT
514 wait_for_completion_interruptible_timeout(&ia->ri_done,
515 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
516 rc = ia->ri_async_rc;
517 if (rc)
518 goto out;
519
520 return id;
521
522out:
523 rdma_destroy_id(id);
524 return ERR_PTR(rc);
525}
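/* The setup above is a two-step, asynchronous handshake: rdma_resolve_addr()
 * and then rdma_resolve_route(), each answered by rpcrdma_conn_upcall()
 * setting ri_async_rc and completing ri_done.  Both waits are bounded by
 * RDMA_RESOLVE_TIMEOUT, and ri_async_rc is preloaded with -ETIMEDOUT so a
 * timed-out wait reports a sensible error.
 */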
526
527/*
528 * Drain any cq, prior to teardown.
529 */
530static void
531rpcrdma_clean_cq(struct ib_cq *cq)
532{
533 struct ib_wc wc;
534 int count = 0;
535
536 while (1 == ib_poll_cq(cq, 1, &wc))
537 ++count;
538
539 if (count)
540 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
541 __func__, count, wc.opcode);
542}
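/* Contrast with rpcrdma_flush_cqs() above: that helper feeds each flushed
 * completion through the normal send/recv processing so FRMR state and the
 * reply tasklet stay consistent, while this one simply discards whatever is
 * still queued because the endpoint is being torn down.
 */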
543
544/*
545 * Exported functions.
546 */
547
548/*
549 * Open and initialize an Interface Adapter.
550 * o initializes fields of struct rpcrdma_ia, including
551 * interface and provider attributes and protection zone.
552 */
553int
554rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
555{
bd7ed1d1
TT
556 int rc, mem_priv;
557 struct ib_device_attr devattr;
c56c65fb
TT
558 struct rpcrdma_ia *ia = &xprt->rx_ia;
559
c56c65fb
TT
560 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
561 if (IS_ERR(ia->ri_id)) {
562 rc = PTR_ERR(ia->ri_id);
563 goto out1;
564 }
565
566 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
567 if (IS_ERR(ia->ri_pd)) {
568 rc = PTR_ERR(ia->ri_pd);
569 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
570 __func__, rc);
571 goto out2;
572 }
573
bd7ed1d1
TT
574 /*
575 * Query the device to determine if the requested memory
576 * registration strategy is supported. If it isn't, set the
577 * strategy to a globally supported model.
578 */
579 rc = ib_query_device(ia->ri_id->device, &devattr);
580 if (rc) {
581 dprintk("RPC: %s: ib_query_device failed %d\n",
582 __func__, rc);
583 goto out2;
584 }
585
586 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
587 ia->ri_have_dma_lkey = 1;
588 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
589 }
590
f10eafd3 591 if (memreg == RPCRDMA_FRMR) {
3197d309
TT
592 /* Requires both frmr reg and local dma lkey */
593 if ((devattr.device_cap_flags &
594 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
595 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
3197d309 596 dprintk("RPC: %s: FRMR registration "
f10eafd3
CL
597 "not supported by HCA\n", __func__);
598 memreg = RPCRDMA_MTHCAFMR;
0fc6c4e7
SW
599 } else {
600 /* Mind the ia limit on FRMR page list depth */
601 ia->ri_max_frmr_depth = min_t(unsigned int,
602 RPCRDMA_MAX_DATA_SEGS,
603 devattr.max_fast_reg_page_list_len);
bd7ed1d1 604 }
f10eafd3
CL
605 }
606 if (memreg == RPCRDMA_MTHCAFMR) {
607 if (!ia->ri_id->device->alloc_fmr) {
608 dprintk("RPC: %s: MTHCAFMR registration "
609 "not supported by HCA\n", __func__);
f10eafd3 610 memreg = RPCRDMA_ALLPHYSICAL;
f10eafd3 611 }
bd7ed1d1
TT
612 }
613
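	/* The fallback ladder above, in short:
	 *   RPCRDMA_FRMR        - needs IB_DEVICE_MEM_MGT_EXTENSIONS and
	 *                         IB_DEVICE_LOCAL_DMA_LKEY
	 *   RPCRDMA_MTHCAFMR    - needs the device's alloc_fmr verb
	 *   RPCRDMA_ALLPHYSICAL - last resort; uses a DMA MR with remote
	 *                         access (see the switch below)
	 */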
c56c65fb
TT
614 /*
615 * Optionally obtain an underlying physical identity mapping in
616 * order to do a memory window-based bind. This base registration
617 * is protected from remote access - that is enabled only by binding
618 * for the specific bytes targeted during each RPC operation, and
619 * revoked after the corresponding completion similar to a storage
620 * adapter.
621 */
bd7ed1d1 622 switch (memreg) {
3197d309 623 case RPCRDMA_FRMR:
bd7ed1d1 624 break;
bd7ed1d1
TT
625 case RPCRDMA_ALLPHYSICAL:
626 mem_priv = IB_ACCESS_LOCAL_WRITE |
627 IB_ACCESS_REMOTE_WRITE |
628 IB_ACCESS_REMOTE_READ;
629 goto register_setup;
bd7ed1d1
TT
630 case RPCRDMA_MTHCAFMR:
631 if (ia->ri_have_dma_lkey)
c56c65fb 632 break;
bd7ed1d1
TT
633 mem_priv = IB_ACCESS_LOCAL_WRITE;
634 register_setup:
c56c65fb
TT
635 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
636 if (IS_ERR(ia->ri_bind_mem)) {
637 printk(KERN_ALERT "%s: ib_get_dma_mr for "
0ac531c1 638 "phys register failed with %lX\n",
c56c65fb 639 __func__, PTR_ERR(ia->ri_bind_mem));
0ac531c1
CL
640 rc = -ENOMEM;
641 goto out2;
c56c65fb 642 }
bd7ed1d1
TT
643 break;
644 default:
cdd9ade7
CL
645 printk(KERN_ERR "RPC: Unsupported memory "
646 "registration mode: %d\n", memreg);
647 rc = -ENOMEM;
bd7ed1d1 648 goto out2;
c56c65fb 649 }
bd7ed1d1
TT
650 dprintk("RPC: %s: memory registration strategy is %d\n",
651 __func__, memreg);
c56c65fb
TT
652
653 /* Else will do memory reg/dereg for each chunk */
654 ia->ri_memreg_strategy = memreg;
655
73806c88 656 rwlock_init(&ia->ri_qplock);
c56c65fb
TT
657 return 0;
658out2:
659 rdma_destroy_id(ia->ri_id);
fee08caf 660 ia->ri_id = NULL;
c56c65fb
TT
661out1:
662 return rc;
663}
664
665/*
666 * Clean up/close an IA.
667 * o if event handles and PD have been initialized, free them.
668 * o close the IA
669 */
670void
671rpcrdma_ia_close(struct rpcrdma_ia *ia)
672{
673 int rc;
674
675 dprintk("RPC: %s: entering\n", __func__);
676 if (ia->ri_bind_mem != NULL) {
677 rc = ib_dereg_mr(ia->ri_bind_mem);
678 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
679 __func__, rc);
680 }
fee08caf
TT
681 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
682 if (ia->ri_id->qp)
683 rdma_destroy_qp(ia->ri_id);
684 rdma_destroy_id(ia->ri_id);
685 ia->ri_id = NULL;
686 }
c56c65fb
TT
687 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
688 rc = ib_dealloc_pd(ia->ri_pd);
689 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
690 __func__, rc);
691 }
c56c65fb
TT
692}
693
694/*
695 * Create unconnected endpoint.
696 */
697int
698rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
699 struct rpcrdma_create_data_internal *cdata)
700{
701 struct ib_device_attr devattr;
fc664485 702 struct ib_cq *sendcq, *recvcq;
5d40a8a5 703 int rc, err;
c56c65fb
TT
704
705 rc = ib_query_device(ia->ri_id->device, &devattr);
706 if (rc) {
707 dprintk("RPC: %s: ib_query_device failed %d\n",
708 __func__, rc);
709 return rc;
710 }
711
712 /* check provider's send/recv wr limits */
713 if (cdata->max_requests > devattr.max_qp_wr)
714 cdata->max_requests = devattr.max_qp_wr;
715
716 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
717 ep->rep_attr.qp_context = ep;
718 /* send_cq and recv_cq initialized below */
719 ep->rep_attr.srq = NULL;
720 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
721 switch (ia->ri_memreg_strategy) {
0fc6c4e7
SW
722 case RPCRDMA_FRMR: {
723 int depth = 7;
724
15cdc644
TT
725 /* Add room for frmr register and invalidate WRs.
726 * 1. FRMR reg WR for head
727 * 2. FRMR invalidate WR for head
0fc6c4e7
SW
728 * 3. N FRMR reg WRs for pagelist
729 * 4. N FRMR invalidate WRs for pagelist
15cdc644
TT
730 * 5. FRMR reg WR for tail
731 * 6. FRMR invalidate WR for tail
732 * 7. The RDMA_SEND WR
733 */
0fc6c4e7
SW
734
735 /* Calculate N if the device max FRMR depth is smaller than
736 * RPCRDMA_MAX_DATA_SEGS.
737 */
738 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
739 int delta = RPCRDMA_MAX_DATA_SEGS -
740 ia->ri_max_frmr_depth;
741
742 do {
743 depth += 2; /* FRMR reg + invalidate */
744 delta -= ia->ri_max_frmr_depth;
745 } while (delta > 0);
746
747 }
748 ep->rep_attr.cap.max_send_wr *= depth;
15cdc644 749 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
0fc6c4e7 750 cdata->max_requests = devattr.max_qp_wr / depth;
15cdc644
TT
751 if (!cdata->max_requests)
752 return -EINVAL;
0fc6c4e7
SW
753 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
754 depth;
15cdc644 755 }
3197d309 756 break;
0fc6c4e7 757 }
c56c65fb
TT
758 default:
759 break;
760 }
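	/* Worked example with illustrative numbers: if the device caps
	 * ri_max_frmr_depth at 16 and RPCRDMA_MAX_DATA_SEGS is 64, then
	 * delta starts at 48 and the loop above runs three times, so
	 * depth = 7 + 3 * 2 = 13.  Each RPC slot may then consume up to
	 * 13 send WRs, and max_requests is scaled down if that total
	 * would exceed the device's max_qp_wr.
	 */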
761 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
762 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
763 ep->rep_attr.cap.max_recv_sge = 1;
764 ep->rep_attr.cap.max_inline_data = 0;
765 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
766 ep->rep_attr.qp_type = IB_QPT_RC;
767 ep->rep_attr.port_num = ~0;
768
769 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
770 "iovs: send %d recv %d\n",
771 __func__,
772 ep->rep_attr.cap.max_send_wr,
773 ep->rep_attr.cap.max_recv_wr,
774 ep->rep_attr.cap.max_send_sge,
775 ep->rep_attr.cap.max_recv_sge);
776
777 /* set trigger for requesting send completion */
fc664485 778 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
e7104a2a
CL
779 if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
780 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
781 else if (ep->rep_cqinit <= 2)
c56c65fb
TT
782 ep->rep_cqinit = 0;
783 INIT_CQCOUNT(ep);
784 ep->rep_ia = ia;
785 init_waitqueue_head(&ep->rep_connect_wait);
254f91e2 786 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
c56c65fb 787
fc664485 788 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
1c00dd07 789 rpcrdma_cq_async_error_upcall, ep,
c56c65fb 790 ep->rep_attr.cap.max_send_wr + 1, 0);
fc664485
CL
791 if (IS_ERR(sendcq)) {
792 rc = PTR_ERR(sendcq);
793 dprintk("RPC: %s: failed to create send CQ: %i\n",
c56c65fb
TT
794 __func__, rc);
795 goto out1;
796 }
797
fc664485 798 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
c56c65fb
TT
799 if (rc) {
800 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
801 __func__, rc);
802 goto out2;
803 }
804
fc664485 805 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
1c00dd07 806 rpcrdma_cq_async_error_upcall, ep,
fc664485
CL
807 ep->rep_attr.cap.max_recv_wr + 1, 0);
808 if (IS_ERR(recvcq)) {
809 rc = PTR_ERR(recvcq);
810 dprintk("RPC: %s: failed to create recv CQ: %i\n",
811 __func__, rc);
812 goto out2;
813 }
814
815 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
816 if (rc) {
817 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
818 __func__, rc);
819 ib_destroy_cq(recvcq);
820 goto out2;
821 }
822
823 ep->rep_attr.send_cq = sendcq;
824 ep->rep_attr.recv_cq = recvcq;
c56c65fb
TT
825
826 /* Initialize cma parameters */
827
828 /* RPC/RDMA does not use private data */
829 ep->rep_remote_cma.private_data = NULL;
830 ep->rep_remote_cma.private_data_len = 0;
831
832 /* Client offers RDMA Read but does not initiate */
b334eaab 833 ep->rep_remote_cma.initiator_depth = 0;
03ff8821 834 if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
b334eaab
TT
835 ep->rep_remote_cma.responder_resources = 32;
836 else
c56c65fb 837 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
c56c65fb
TT
838
839 ep->rep_remote_cma.retry_count = 7;
840 ep->rep_remote_cma.flow_control = 0;
841 ep->rep_remote_cma.rnr_retry_count = 0;
842
843 return 0;
844
845out2:
fc664485 846 err = ib_destroy_cq(sendcq);
5d40a8a5
CL
847 if (err)
848 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
849 __func__, err);
c56c65fb
TT
850out1:
851 return rc;
852}
853
854/*
855 * rpcrdma_ep_destroy
856 *
857 * Disconnect and destroy endpoint. After this, the only
858 * valid operations on the ep are to free it (if dynamically
859 * allocated) or re-create it.
c56c65fb 860 */
7f1d5419 861void
c56c65fb
TT
862rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
863{
864 int rc;
865
866 dprintk("RPC: %s: entering, connected is %d\n",
867 __func__, ep->rep_connected);
868
254f91e2
CL
869 cancel_delayed_work_sync(&ep->rep_connect_worker);
870
c56c65fb 871 if (ia->ri_id->qp) {
282191cb 872 rpcrdma_ep_disconnect(ep, ia);
fee08caf
TT
873 rdma_destroy_qp(ia->ri_id);
874 ia->ri_id->qp = NULL;
c56c65fb
TT
875 }
876
c56c65fb
TT
877 /* padding - could be done in rpcrdma_buffer_destroy... */
878 if (ep->rep_pad_mr) {
879 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
880 ep->rep_pad_mr = NULL;
881 }
882
fc664485
CL
883 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
884 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
885 if (rc)
886 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
887 __func__, rc);
888
889 rpcrdma_clean_cq(ep->rep_attr.send_cq);
890 rc = ib_destroy_cq(ep->rep_attr.send_cq);
c56c65fb
TT
891 if (rc)
892 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
893 __func__, rc);
c56c65fb
TT
894}
895
896/*
897 * Connect unconnected endpoint.
898 */
899int
900rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
901{
73806c88 902 struct rdma_cm_id *id, *old;
c56c65fb
TT
903 int rc = 0;
904 int retry_count = 0;
c56c65fb 905
c055551e 906 if (ep->rep_connected != 0) {
c56c65fb
TT
907 struct rpcrdma_xprt *xprt;
908retry:
ec62f40d 909 dprintk("RPC: %s: reconnecting...\n", __func__);
282191cb
CL
910
911 rpcrdma_ep_disconnect(ep, ia);
a7bc211a 912 rpcrdma_flush_cqs(ep);
c56c65fb 913
467c9674
CL
914 switch (ia->ri_memreg_strategy) {
915 case RPCRDMA_FRMR:
9f9d802a 916 rpcrdma_reset_frmrs(ia);
467c9674
CL
917 break;
918 case RPCRDMA_MTHCAFMR:
919 rpcrdma_reset_fmrs(ia);
920 break;
921 case RPCRDMA_ALLPHYSICAL:
922 break;
923 default:
924 rc = -EIO;
925 goto out;
926 }
9f9d802a 927
c56c65fb
TT
928 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
929 id = rpcrdma_create_id(xprt, ia,
930 (struct sockaddr *)&xprt->rx_data.addr);
931 if (IS_ERR(id)) {
ec62f40d 932 rc = -EHOSTUNREACH;
c56c65fb
TT
933 goto out;
934 }
935 /* TEMP TEMP TEMP - fail if new device:
936 * Deregister/remarshal *all* requests!
937 * Close and recreate adapter, pd, etc!
938 * Re-determine all attributes still sane!
939 * More stuff I haven't thought of!
940 * Rrrgh!
941 */
942 if (ia->ri_id->device != id->device) {
943 printk("RPC: %s: can't reconnect on "
944 "different device!\n", __func__);
945 rdma_destroy_id(id);
ec62f40d 946 rc = -ENETUNREACH;
c56c65fb
TT
947 goto out;
948 }
949 /* END TEMP */
ec62f40d
CL
950 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
951 if (rc) {
952 dprintk("RPC: %s: rdma_create_qp failed %i\n",
953 __func__, rc);
954 rdma_destroy_id(id);
955 rc = -ENETUNREACH;
956 goto out;
957 }
73806c88
CL
958
959 write_lock(&ia->ri_qplock);
960 old = ia->ri_id;
c56c65fb 961 ia->ri_id = id;
73806c88
CL
962 write_unlock(&ia->ri_qplock);
963
964 rdma_destroy_qp(old);
965 rdma_destroy_id(old);
ec62f40d
CL
966 } else {
967 dprintk("RPC: %s: connecting...\n", __func__);
968 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
969 if (rc) {
970 dprintk("RPC: %s: rdma_create_qp failed %i\n",
971 __func__, rc);
972 /* do not update ep->rep_connected */
973 return -ENETUNREACH;
974 }
c56c65fb
TT
975 }
976
c56c65fb
TT
977 ep->rep_connected = 0;
978
979 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
980 if (rc) {
981 dprintk("RPC: %s: rdma_connect() failed with %i\n",
982 __func__, rc);
983 goto out;
984 }
985
c56c65fb
TT
986 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
987
988 /*
989 * Check state. A non-peer reject indicates no listener
990 * (ECONNREFUSED), which may be a transient state. All
991 * others indicate a transport condition which has already
 992 * undergone a best-effort recovery.
993 */
f64f9e71
JP
994 if (ep->rep_connected == -ECONNREFUSED &&
995 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
c56c65fb
TT
996 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
997 goto retry;
998 }
999 if (ep->rep_connected <= 0) {
1000 /* Sometimes, the only way to reliably connect to remote
1001 * CMs is to use same nonzero values for ORD and IRD. */
b334eaab
TT
1002 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
1003 (ep->rep_remote_cma.responder_resources == 0 ||
1004 ep->rep_remote_cma.initiator_depth !=
1005 ep->rep_remote_cma.responder_resources)) {
1006 if (ep->rep_remote_cma.responder_resources == 0)
1007 ep->rep_remote_cma.responder_resources = 1;
1008 ep->rep_remote_cma.initiator_depth =
1009 ep->rep_remote_cma.responder_resources;
c56c65fb 1010 goto retry;
b334eaab 1011 }
c56c65fb
TT
1012 rc = ep->rep_connected;
1013 } else {
1014 dprintk("RPC: %s: connected\n", __func__);
1015 }
1016
1017out:
1018 if (rc)
1019 ep->rep_connected = rc;
1020 return rc;
1021}
1022
1023/*
1024 * rpcrdma_ep_disconnect
1025 *
1026 * This is separate from destroy to facilitate the ability
1027 * to reconnect without recreating the endpoint.
1028 *
1029 * This call is not reentrant, and must not be made in parallel
1030 * on the same endpoint.
1031 */
282191cb 1032void
c56c65fb
TT
1033rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
1034{
1035 int rc;
1036
a7bc211a 1037 rpcrdma_flush_cqs(ep);
c56c65fb
TT
1038 rc = rdma_disconnect(ia->ri_id);
1039 if (!rc) {
1040 /* returns without wait if not connected */
1041 wait_event_interruptible(ep->rep_connect_wait,
1042 ep->rep_connected != 1);
1043 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
1044 (ep->rep_connected == 1) ? "still " : "dis");
1045 } else {
1046 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1047 ep->rep_connected = rc;
1048 }
c56c65fb
TT
1049}
1050
2e84522c
CL
1051static int
1052rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1053{
1054 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1055 struct ib_fmr_attr fmr_attr = {
1056 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1057 .max_maps = 1,
1058 .page_shift = PAGE_SHIFT
1059 };
1060 struct rpcrdma_mw *r;
1061 int i, rc;
1062
1063 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
 1064 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
1065
1066 while (i--) {
1067 r = kzalloc(sizeof(*r), GFP_KERNEL);
1068 if (r == NULL)
1069 return -ENOMEM;
1070
1071 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1072 if (IS_ERR(r->r.fmr)) {
1073 rc = PTR_ERR(r->r.fmr);
1074 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1075 __func__, rc);
1076 goto out_free;
1077 }
1078
1079 list_add(&r->mw_list, &buf->rb_mws);
1080 list_add(&r->mw_all, &buf->rb_all);
1081 }
1082 return 0;
1083
1084out_free:
1085 kfree(r);
1086 return rc;
1087}
1088
1089static int
1090rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1091{
1092 struct rpcrdma_frmr *f;
1093 struct rpcrdma_mw *r;
1094 int i, rc;
1095
1096 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
 1097 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
1098
1099 while (i--) {
1100 r = kzalloc(sizeof(*r), GFP_KERNEL);
1101 if (r == NULL)
1102 return -ENOMEM;
1103 f = &r->r.frmr;
1104
1105 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1106 ia->ri_max_frmr_depth);
1107 if (IS_ERR(f->fr_mr)) {
1108 rc = PTR_ERR(f->fr_mr);
1109 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1110 "failed %i\n", __func__, rc);
1111 goto out_free;
1112 }
1113
1114 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1115 ia->ri_max_frmr_depth);
1116 if (IS_ERR(f->fr_pgl)) {
1117 rc = PTR_ERR(f->fr_pgl);
1118 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1119 "failed %i\n", __func__, rc);
1120
1121 ib_dereg_mr(f->fr_mr);
1122 goto out_free;
1123 }
1124
1125 list_add(&r->mw_list, &buf->rb_mws);
1126 list_add(&r->mw_all, &buf->rb_all);
1127 }
1128
1129 return 0;
1130
1131out_free:
1132 kfree(r);
1133 return rc;
1134}
1135
c56c65fb
TT
1136int
1137rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
1138 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
1139{
1140 char *p;
65866f82 1141 size_t len, rlen, wlen;
c56c65fb
TT
1142 int i, rc;
1143
1144 buf->rb_max_requests = cdata->max_requests;
1145 spin_lock_init(&buf->rb_lock);
1146 atomic_set(&buf->rb_credits, 1);
1147
1148 /* Need to allocate:
1149 * 1. arrays for send and recv pointers
1150 * 2. arrays of struct rpcrdma_req to fill in pointers
1151 * 3. array of struct rpcrdma_rep for replies
1152 * 4. padding, if any
c56c65fb
TT
1153 * Send/recv buffers in req/rep need to be registered
1154 */
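	/* Layout of the single rb_pool allocation, for reference:
	 *
	 *   [ rb_max_requests x struct rpcrdma_req * ]   <- rb_send_bufs
	 *   [ rb_max_requests x struct rpcrdma_rep * ]   <- rb_recv_bufs
	 *   [ cdata->padding bytes of zeroed pad buffer ]
	 *
	 * The pad area, if present, is registered separately below.
	 */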
c56c65fb
TT
1155 len = buf->rb_max_requests *
1156 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1157 len += cdata->padding;
c56c65fb 1158
c56c65fb
TT
1159 p = kzalloc(len, GFP_KERNEL);
1160 if (p == NULL) {
1161 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1162 __func__, len);
1163 rc = -ENOMEM;
1164 goto out;
1165 }
1166 buf->rb_pool = p; /* for freeing it later */
1167
1168 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1169 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1170 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1171 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1172
1173 /*
1174 * Register the zeroed pad buffer, if any.
1175 */
1176 if (cdata->padding) {
1177 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1178 &ep->rep_pad_mr, &ep->rep_pad);
1179 if (rc)
1180 goto out;
1181 }
1182 p += cdata->padding;
1183
c56c65fb 1184 INIT_LIST_HEAD(&buf->rb_mws);
3111d72c 1185 INIT_LIST_HEAD(&buf->rb_all);
c56c65fb 1186 switch (ia->ri_memreg_strategy) {
3197d309 1187 case RPCRDMA_FRMR:
2e84522c
CL
1188 rc = rpcrdma_init_frmrs(ia, buf);
1189 if (rc)
1190 goto out;
3197d309 1191 break;
c56c65fb 1192 case RPCRDMA_MTHCAFMR:
2e84522c
CL
1193 rc = rpcrdma_init_fmrs(ia, buf);
1194 if (rc)
1195 goto out;
c56c65fb 1196 break;
c56c65fb
TT
1197 default:
1198 break;
1199 }
1200
1201 /*
1202 * Allocate/init the request/reply buffers. Doing this
1203 * using kmalloc for now -- one for each buf.
1204 */
65866f82
CL
1205 wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
1206 rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
1207 dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
1208 __func__, wlen, rlen);
1209
c56c65fb
TT
1210 for (i = 0; i < buf->rb_max_requests; i++) {
1211 struct rpcrdma_req *req;
1212 struct rpcrdma_rep *rep;
1213
65866f82 1214 req = kmalloc(wlen, GFP_KERNEL);
c56c65fb
TT
1215 if (req == NULL) {
1216 dprintk("RPC: %s: request buffer %d alloc"
1217 " failed\n", __func__, i);
1218 rc = -ENOMEM;
1219 goto out;
1220 }
1221 memset(req, 0, sizeof(struct rpcrdma_req));
1222 buf->rb_send_bufs[i] = req;
1223 buf->rb_send_bufs[i]->rl_buffer = buf;
1224
1225 rc = rpcrdma_register_internal(ia, req->rl_base,
65866f82 1226 wlen - offsetof(struct rpcrdma_req, rl_base),
c56c65fb
TT
1227 &buf->rb_send_bufs[i]->rl_handle,
1228 &buf->rb_send_bufs[i]->rl_iov);
1229 if (rc)
1230 goto out;
1231
65866f82
CL
1232 buf->rb_send_bufs[i]->rl_size = wlen -
1233 sizeof(struct rpcrdma_req);
c56c65fb 1234
65866f82 1235 rep = kmalloc(rlen, GFP_KERNEL);
c56c65fb
TT
1236 if (rep == NULL) {
1237 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1238 __func__, i);
1239 rc = -ENOMEM;
1240 goto out;
1241 }
1242 memset(rep, 0, sizeof(struct rpcrdma_rep));
1243 buf->rb_recv_bufs[i] = rep;
1244 buf->rb_recv_bufs[i]->rr_buffer = buf;
c56c65fb
TT
1245
1246 rc = rpcrdma_register_internal(ia, rep->rr_base,
65866f82 1247 rlen - offsetof(struct rpcrdma_rep, rr_base),
c56c65fb
TT
1248 &buf->rb_recv_bufs[i]->rr_handle,
1249 &buf->rb_recv_bufs[i]->rr_iov);
1250 if (rc)
1251 goto out;
1252
1253 }
1254 dprintk("RPC: %s: max_requests %d\n",
1255 __func__, buf->rb_max_requests);
1256 /* done */
1257 return 0;
1258out:
1259 rpcrdma_buffer_destroy(buf);
1260 return rc;
1261}
1262
2e84522c
CL
1263static void
1264rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1265{
1266 struct rpcrdma_mw *r;
1267 int rc;
1268
1269 while (!list_empty(&buf->rb_all)) {
1270 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1271 list_del(&r->mw_all);
1272 list_del(&r->mw_list);
1273
1274 rc = ib_dealloc_fmr(r->r.fmr);
1275 if (rc)
1276 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1277 __func__, rc);
1278
1279 kfree(r);
1280 }
1281}
1282
1283static void
1284rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1285{
1286 struct rpcrdma_mw *r;
1287 int rc;
1288
1289 while (!list_empty(&buf->rb_all)) {
1290 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1291 list_del(&r->mw_all);
1292 list_del(&r->mw_list);
1293
1294 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1295 if (rc)
1296 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1297 __func__, rc);
1298 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1299
1300 kfree(r);
1301 }
1302}
1303
c56c65fb
TT
1304void
1305rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1306{
c56c65fb 1307 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
2e84522c 1308 int i;
c56c65fb
TT
1309
1310 /* clean up in reverse order from create
1311 * 1. recv mr memory (mr free, then kfree)
c56c65fb 1312 * 2. send mr memory (mr free, then kfree)
2e84522c 1313 * 3. MWs
c56c65fb
TT
1314 */
1315 dprintk("RPC: %s: entering\n", __func__);
1316
1317 for (i = 0; i < buf->rb_max_requests; i++) {
1318 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1319 rpcrdma_deregister_internal(ia,
1320 buf->rb_recv_bufs[i]->rr_handle,
1321 &buf->rb_recv_bufs[i]->rr_iov);
1322 kfree(buf->rb_recv_bufs[i]);
1323 }
1324 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
c56c65fb
TT
1325 rpcrdma_deregister_internal(ia,
1326 buf->rb_send_bufs[i]->rl_handle,
1327 &buf->rb_send_bufs[i]->rl_iov);
1328 kfree(buf->rb_send_bufs[i]);
1329 }
1330 }
1331
2e84522c
CL
1332 switch (ia->ri_memreg_strategy) {
1333 case RPCRDMA_FRMR:
1334 rpcrdma_destroy_frmrs(buf);
1335 break;
1336 case RPCRDMA_MTHCAFMR:
1337 rpcrdma_destroy_fmrs(buf);
1338 break;
1339 default:
1340 break;
4034ba04
AA
1341 }
1342
c56c65fb
TT
1343 kfree(buf->rb_pool);
1344}
1345
467c9674
CL
1346/* After a disconnect, unmap all FMRs.
1347 *
1348 * This is invoked only in the transport connect worker in order
1349 * to serialize with rpcrdma_register_fmr_external().
1350 */
1351static void
1352rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1353{
1354 struct rpcrdma_xprt *r_xprt =
1355 container_of(ia, struct rpcrdma_xprt, rx_ia);
1356 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1357 struct list_head *pos;
1358 struct rpcrdma_mw *r;
1359 LIST_HEAD(l);
1360 int rc;
1361
1362 list_for_each(pos, &buf->rb_all) {
1363 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1364
1365 INIT_LIST_HEAD(&l);
1366 list_add(&r->r.fmr->list, &l);
1367 rc = ib_unmap_fmr(&l);
1368 if (rc)
1369 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
1370 __func__, rc);
1371 }
1372}
1373
9f9d802a
CL
1374/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1375 * an unusable state. Find FRMRs in this state and dereg / reg
1376 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1377 * also torn down.
1378 *
1379 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1380 *
1381 * This is invoked only in the transport connect worker in order
1382 * to serialize with rpcrdma_register_frmr_external().
1383 */
1384static void
1385rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1386{
1387 struct rpcrdma_xprt *r_xprt =
1388 container_of(ia, struct rpcrdma_xprt, rx_ia);
1389 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1390 struct list_head *pos;
1391 struct rpcrdma_mw *r;
1392 int rc;
1393
1394 list_for_each(pos, &buf->rb_all) {
1395 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1396
1397 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1398 continue;
1399
1400 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1401 if (rc)
1402 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1403 __func__, rc);
1404 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1405
1406 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1407 ia->ri_max_frmr_depth);
1408 if (IS_ERR(r->r.frmr.fr_mr)) {
1409 rc = PTR_ERR(r->r.frmr.fr_mr);
1410 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1411 " failed %i\n", __func__, rc);
1412 continue;
1413 }
1414 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1415 ia->ri_id->device,
1416 ia->ri_max_frmr_depth);
1417 if (IS_ERR(r->r.frmr.fr_pgl)) {
1418 rc = PTR_ERR(r->r.frmr.fr_pgl);
1419 dprintk("RPC: %s: "
1420 "ib_alloc_fast_reg_page_list "
1421 "failed %i\n", __func__, rc);
1422
1423 ib_dereg_mr(r->r.frmr.fr_mr);
1424 continue;
1425 }
1426 r->r.frmr.fr_state = FRMR_IS_INVALID;
1427 }
1428}
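/* FRMR life cycle, roughly, as used in this file:
 *
 *	FRMR_IS_INVALID --(FAST_REG_MR posted at registration)--> FRMR_IS_VALID
 *	FRMR_IS_VALID   --(LOCAL_INV posted at deregistration)--> FRMR_IS_INVALID
 *	any state       --(WR flushed by a disconnect)----------> FRMR_IS_STALE
 *
 * Stale FRMRs are either re-allocated here with a fresh rkey, or retried
 * with a LOCAL_INV by rpcrdma_retry_local_inv() when rpcrdma_buffer_get()
 * finds them on rb_mws.
 */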
1429
c2922c02
CL
1430/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1431 * some req segments uninitialized.
1432 */
1433static void
1434rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1435{
1436 if (*mw) {
1437 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1438 *mw = NULL;
1439 }
1440}
1441
1442/* Cycle mw's back in reverse order, and "spin" them.
1443 * This delays and scrambles reuse as much as possible.
1444 */
1445static void
1446rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1447{
1448 struct rpcrdma_mr_seg *seg = req->rl_segments;
1449 struct rpcrdma_mr_seg *seg1 = seg;
1450 int i;
1451
1452 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1453 rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf);
1454 rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf);
1455}
1456
1457static void
1458rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1459{
1460 buf->rb_send_bufs[--buf->rb_send_index] = req;
1461 req->rl_niovs = 0;
1462 if (req->rl_reply) {
1463 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1464 req->rl_reply->rr_func = NULL;
1465 req->rl_reply = NULL;
1466 }
1467}
1468
ddb6bebc
CL
1469/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1470 * Redo only the ib_post_send().
1471 */
1472static void
1473rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1474{
1475 struct rpcrdma_xprt *r_xprt =
1476 container_of(ia, struct rpcrdma_xprt, rx_ia);
1477 struct ib_send_wr invalidate_wr, *bad_wr;
1478 int rc;
1479
1480 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1481
1482 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
dab7e3b8 1483 r->r.frmr.fr_state = FRMR_IS_INVALID;
ddb6bebc
CL
1484
1485 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1486 invalidate_wr.wr_id = (unsigned long)(void *)r;
1487 invalidate_wr.opcode = IB_WR_LOCAL_INV;
ddb6bebc
CL
1488 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1489 DECR_CQCOUNT(&r_xprt->rx_ep);
1490
1491 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1492 __func__, r, r->r.frmr.fr_mr->rkey);
1493
1494 read_lock(&ia->ri_qplock);
1495 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1496 read_unlock(&ia->ri_qplock);
1497 if (rc) {
1498 /* Force rpcrdma_buffer_get() to retry */
1499 r->r.frmr.fr_state = FRMR_IS_STALE;
1500 dprintk("RPC: %s: ib_post_send failed, %i\n",
1501 __func__, rc);
1502 }
1503}
1504
1505static void
1506rpcrdma_retry_flushed_linv(struct list_head *stale,
1507 struct rpcrdma_buffer *buf)
1508{
1509 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1510 struct list_head *pos;
1511 struct rpcrdma_mw *r;
1512 unsigned long flags;
1513
1514 list_for_each(pos, stale) {
1515 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1516 rpcrdma_retry_local_inv(r, ia);
1517 }
1518
1519 spin_lock_irqsave(&buf->rb_lock, flags);
1520 list_splice_tail(stale, &buf->rb_mws);
1521 spin_unlock_irqrestore(&buf->rb_lock, flags);
1522}
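/* Recovery path for flushed FRMRs, in order:
 * 1. rpcrdma_buffer_get_frmrs() (below) moves any FRMR_IS_STALE mw it
 *    finds on rb_mws onto the caller's "stale" list instead of assigning
 *    it to the request.
 * 2. After rb_lock is dropped, rpcrdma_buffer_get() hands that list to
 *    rpcrdma_retry_flushed_linv(), which posts a LOCAL_INV for each mw.
 * 3. The mws are then spliced back onto rb_mws for reuse.
 */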
1523
1524static struct rpcrdma_req *
1525rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1526 struct list_head *stale)
1527{
1528 struct rpcrdma_mw *r;
1529 int i;
1530
1531 i = RPCRDMA_MAX_SEGS - 1;
1532 while (!list_empty(&buf->rb_mws)) {
1533 r = list_entry(buf->rb_mws.next,
1534 struct rpcrdma_mw, mw_list);
1535 list_del(&r->mw_list);
1536 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1537 list_add(&r->mw_list, stale);
1538 continue;
1539 }
1540 req->rl_segments[i].mr_chunk.rl_mw = r;
1541 if (unlikely(i-- == 0))
1542 return req; /* Success */
1543 }
1544
1545 /* Not enough entries on rb_mws for this req */
1546 rpcrdma_buffer_put_sendbuf(req, buf);
1547 rpcrdma_buffer_put_mrs(req, buf);
1548 return NULL;
1549}
1550
c2922c02 1551static struct rpcrdma_req *
ddb6bebc 1552rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
c2922c02
CL
1553{
1554 struct rpcrdma_mw *r;
1555 int i;
1556
1557 i = RPCRDMA_MAX_SEGS - 1;
1558 while (!list_empty(&buf->rb_mws)) {
1559 r = list_entry(buf->rb_mws.next,
1560 struct rpcrdma_mw, mw_list);
1561 list_del(&r->mw_list);
1562 req->rl_segments[i].mr_chunk.rl_mw = r;
1563 if (unlikely(i-- == 0))
1564 return req; /* Success */
1565 }
1566
1567 /* Not enough entries on rb_mws for this req */
1568 rpcrdma_buffer_put_sendbuf(req, buf);
1569 rpcrdma_buffer_put_mrs(req, buf);
1570 return NULL;
1571}
1572
c56c65fb
TT
1573/*
1574 * Get a set of request/reply buffers.
1575 *
1576 * Reply buffer (if needed) is attached to send buffer upon return.
1577 * Rule:
1578 * rb_send_index and rb_recv_index MUST always be pointing to the
1579 * *next* available buffer (non-NULL). They are incremented after
1580 * removing buffers, and decremented *before* returning them.
1581 */
1582struct rpcrdma_req *
1583rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1584{
c2922c02 1585 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
ddb6bebc 1586 struct list_head stale;
c56c65fb
TT
1587 struct rpcrdma_req *req;
1588 unsigned long flags;
1589
1590 spin_lock_irqsave(&buffers->rb_lock, flags);
1591 if (buffers->rb_send_index == buffers->rb_max_requests) {
1592 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1593 dprintk("RPC: %s: out of request buffers\n", __func__);
1594 return ((struct rpcrdma_req *)NULL);
1595 }
1596
1597 req = buffers->rb_send_bufs[buffers->rb_send_index];
1598 if (buffers->rb_send_index < buffers->rb_recv_index) {
1599 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1600 __func__,
1601 buffers->rb_recv_index - buffers->rb_send_index);
1602 req->rl_reply = NULL;
1603 } else {
1604 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1605 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1606 }
1607 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
ddb6bebc
CL
1608
1609 INIT_LIST_HEAD(&stale);
c2922c02
CL
1610 switch (ia->ri_memreg_strategy) {
1611 case RPCRDMA_FRMR:
ddb6bebc
CL
1612 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1613 break;
c2922c02 1614 case RPCRDMA_MTHCAFMR:
ddb6bebc 1615 req = rpcrdma_buffer_get_fmrs(req, buffers);
c2922c02
CL
1616 break;
1617 default:
1618 break;
c56c65fb
TT
1619 }
1620 spin_unlock_irqrestore(&buffers->rb_lock, flags);
ddb6bebc
CL
1621 if (!list_empty(&stale))
1622 rpcrdma_retry_flushed_linv(&stale, buffers);
c56c65fb
TT
1623 return req;
1624}
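/* The index invariant above, by example: with rb_max_requests = 4 and two
 * requests outstanding, rb_send_index == 2 and rb_send_bufs[2] is the next
 * request to hand out (slots 0 and 1 are NULL).  A get reads the slot and
 * then post-increments the index; a put pre-decrements the index and then
 * stores, so the two stay in step.
 */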
1625
1626/*
1627 * Put request/reply buffers back into pool.
1628 * Pre-decrement counter/array index.
1629 */
1630void
1631rpcrdma_buffer_put(struct rpcrdma_req *req)
1632{
1633 struct rpcrdma_buffer *buffers = req->rl_buffer;
1634 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
c56c65fb
TT
1635 unsigned long flags;
1636
c56c65fb 1637 spin_lock_irqsave(&buffers->rb_lock, flags);
c2922c02 1638 rpcrdma_buffer_put_sendbuf(req, buffers);
c56c65fb 1639 switch (ia->ri_memreg_strategy) {
3197d309 1640 case RPCRDMA_FRMR:
c56c65fb 1641 case RPCRDMA_MTHCAFMR:
c2922c02 1642 rpcrdma_buffer_put_mrs(req, buffers);
c56c65fb
TT
1643 break;
1644 default:
1645 break;
1646 }
1647 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1648}
1649
1650/*
1651 * Recover reply buffers from pool.
1652 * This happens when recovering from error conditions.
1653 * Post-increment counter/array index.
1654 */
1655void
1656rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1657{
1658 struct rpcrdma_buffer *buffers = req->rl_buffer;
1659 unsigned long flags;
1660
1661 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1662 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1663 spin_lock_irqsave(&buffers->rb_lock, flags);
1664 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1665 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1666 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1667 }
1668 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1669}
1670
1671/*
1672 * Put reply buffers back into pool when not attached to
b45ccfd2 1673 * request. This happens in error conditions.
c56c65fb
TT
1674 */
1675void
1676rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1677{
1678 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1679 unsigned long flags;
1680
1681 rep->rr_func = NULL;
1682 spin_lock_irqsave(&buffers->rb_lock, flags);
1683 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1684 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1685}
1686
1687/*
1688 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1689 */
1690
1691int
1692rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1693 struct ib_mr **mrp, struct ib_sge *iov)
1694{
1695 struct ib_phys_buf ipb;
1696 struct ib_mr *mr;
1697 int rc;
1698
1699 /*
1700 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1701 */
1702 iov->addr = ib_dma_map_single(ia->ri_id->device,
1703 va, len, DMA_BIDIRECTIONAL);
bf858ab0
YB
1704 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1705 return -ENOMEM;
1706
c56c65fb
TT
1707 iov->length = len;
1708
bd7ed1d1
TT
1709 if (ia->ri_have_dma_lkey) {
1710 *mrp = NULL;
1711 iov->lkey = ia->ri_dma_lkey;
1712 return 0;
1713 } else if (ia->ri_bind_mem != NULL) {
c56c65fb
TT
1714 *mrp = NULL;
1715 iov->lkey = ia->ri_bind_mem->lkey;
1716 return 0;
1717 }
1718
1719 ipb.addr = iov->addr;
1720 ipb.size = iov->length;
1721 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1722 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1723
1724 dprintk("RPC: %s: phys convert: 0x%llx "
1725 "registered 0x%llx length %d\n",
a56daeb7
AM
1726 __func__, (unsigned long long)ipb.addr,
1727 (unsigned long long)iov->addr, len);
c56c65fb
TT
1728
1729 if (IS_ERR(mr)) {
1730 *mrp = NULL;
1731 rc = PTR_ERR(mr);
1732 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1733 } else {
1734 *mrp = mr;
1735 iov->lkey = mr->lkey;
1736 rc = 0;
1737 }
1738
1739 return rc;
1740}
1741
1742int
1743rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1744 struct ib_mr *mr, struct ib_sge *iov)
1745{
1746 int rc;
1747
1748 ib_dma_unmap_single(ia->ri_id->device,
1749 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1750
1751 if (NULL == mr)
1752 return 0;
1753
1754 rc = ib_dereg_mr(mr);
1755 if (rc)
1756 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1757 return rc;
1758}
1759
1760/*
1761 * Wrappers for chunk registration, shared by read/write chunk code.
1762 */
1763
1764static void
1765rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1766{
1767 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1768 seg->mr_dmalen = seg->mr_len;
1769 if (seg->mr_page)
1770 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1771 seg->mr_page, offset_in_page(seg->mr_offset),
1772 seg->mr_dmalen, seg->mr_dir);
1773 else
1774 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1775 seg->mr_offset,
1776 seg->mr_dmalen, seg->mr_dir);
5c635e09
TT
1777 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1778 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1779 __func__,
986d4abb
RD
1780 (unsigned long long)seg->mr_dma,
1781 seg->mr_offset, seg->mr_dmalen);
5c635e09 1782 }
c56c65fb
TT
1783}
1784
1785static void
1786rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1787{
1788 if (seg->mr_page)
1789 ib_dma_unmap_page(ia->ri_id->device,
1790 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1791 else
1792 ib_dma_unmap_single(ia->ri_id->device,
1793 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1794}
1795
3197d309
TT
1796static int
1797rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1798 int *nsegs, int writing, struct rpcrdma_ia *ia,
1799 struct rpcrdma_xprt *r_xprt)
1800{
1801 struct rpcrdma_mr_seg *seg1 = seg;
0dbb4108
CL
1802 struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw;
1803 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1804 struct ib_mr *mr = frmr->fr_mr;
f590e878 1805 struct ib_send_wr fastreg_wr, *bad_wr;
3197d309
TT
1806 u8 key;
1807 int len, pageoff;
1808 int i, rc;
9b78145c
TT
1809 int seg_len;
1810 u64 pa;
1811 int page_no;
3197d309
TT
1812
1813 pageoff = offset_in_page(seg1->mr_offset);
1814 seg1->mr_offset -= pageoff; /* start of page */
1815 seg1->mr_len += pageoff;
1816 len = -pageoff;
0fc6c4e7
SW
1817 if (*nsegs > ia->ri_max_frmr_depth)
1818 *nsegs = ia->ri_max_frmr_depth;
9b78145c 1819 for (page_no = i = 0; i < *nsegs;) {
3197d309 1820 rpcrdma_map_one(ia, seg, writing);
9b78145c
TT
1821 pa = seg->mr_dma;
1822 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
0dbb4108 1823 frmr->fr_pgl->page_list[page_no++] = pa;
9b78145c
TT
1824 pa += PAGE_SIZE;
1825 }
3197d309
TT
1826 len += seg->mr_len;
1827 ++seg;
1828 ++i;
1829 /* Check for holes */
1830 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1831 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1832 break;
1833 }
1834 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
0dbb4108 1835 __func__, mw, i);
3197d309 1836
05055722
CL
1837 frmr->fr_state = FRMR_IS_VALID;
1838
f590e878
CL
1839 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1840 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1841 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1842 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1843 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1844 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1845 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1846 fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1847 if (fastreg_wr.wr.fast_reg.length < len) {
5fc83f47
CL
1848 rc = -EIO;
1849 goto out_err;
c977dea2
CL
1850 }
1851
1852 /* Bump the key */
0dbb4108
CL
1853 key = (u8)(mr->rkey & 0x000000FF);
1854 ib_update_fast_reg_key(mr, ++key);
c977dea2 1855
f590e878 1856 fastreg_wr.wr.fast_reg.access_flags = (writing ?
68743082
VP
1857 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1858 IB_ACCESS_REMOTE_READ);
f590e878 1859 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
3197d309
TT
1860 DECR_CQCOUNT(&r_xprt->rx_ep);
1861
f590e878 1862 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
3197d309
TT
1863 if (rc) {
1864 dprintk("RPC: %s: failed ib_post_send for register,"
1865 " status %i\n", __func__, rc);
c93e986a 1866 ib_update_fast_reg_key(mr, --key);
5fc83f47 1867 goto out_err;
3197d309 1868 } else {
0dbb4108 1869 seg1->mr_rkey = mr->rkey;
3197d309
TT
1870 seg1->mr_base = seg1->mr_dma + pageoff;
1871 seg1->mr_nsegs = i;
1872 seg1->mr_len = len;
1873 }
1874 *nsegs = i;
5fc83f47
CL
1875 return 0;
1876out_err:
05055722 1877 frmr->fr_state = FRMR_IS_INVALID;
5fc83f47
CL
1878 while (i--)
1879 rpcrdma_unmap_one(ia, --seg);
3197d309
TT
1880 return rc;
1881}
1882
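/*
 * FRMR invalidation: mark the FRMR invalid, post an IB_WR_LOCAL_INV
 * for its rkey, and DMA-unmap the segments.  Both the unmap loop and
 * the post run under ri_qplock so the QP cannot be replaced (e.g. on
 * reconnect) in the middle of the operation.  If the post fails, the
 * FRMR is marked FRMR_IS_STALE, which forces rpcrdma_buffer_get() to
 * retry with a different MR.
 */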
1883static int
1884rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1885 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1886{
1887 struct rpcrdma_mr_seg *seg1 = seg;
1888 struct ib_send_wr invalidate_wr, *bad_wr;
1889 int rc;
1890
dab7e3b8
CL
1891 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1892
3197d309 1893 memset(&invalidate_wr, 0, sizeof invalidate_wr);
5c635e09 1894 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
3197d309 1895 invalidate_wr.opcode = IB_WR_LOCAL_INV;
3197d309
TT
1896 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1897 DECR_CQCOUNT(&r_xprt->rx_ep);
1898
73806c88
CL
1899 read_lock(&ia->ri_qplock);
1900 while (seg1->mr_nsegs--)
1901 rpcrdma_unmap_one(ia, seg++);
3197d309 1902 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
73806c88 1903 read_unlock(&ia->ri_qplock);
dab7e3b8
CL
1904 if (rc) {
1905 /* Force rpcrdma_buffer_get() to retry */
1906 seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
3197d309
TT
1907 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1908 " status %i\n", __func__, rc);
dab7e3b8 1909 }
3197d309
TT
1910 return rc;
1911}
1912
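/*
 * FMR registration: collect the DMA address of each mapped segment
 * (up to RPCRDMA_MAX_DATA_SEGS, applying the same page-boundary "hole"
 * rule as the FRMR path) into physaddrs[] and hand the array to
 * ib_map_phys_fmr().  Unlike FRMR, no work request is posted; the verb
 * maps the region synchronously and the FMR's existing rkey is used
 * unchanged.
 */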
8d4ba034
TT
1913static int
1914rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1915 int *nsegs, int writing, struct rpcrdma_ia *ia)
1916{
1917 struct rpcrdma_mr_seg *seg1 = seg;
1918 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1919 int len, pageoff, i, rc;
1920
1921 pageoff = offset_in_page(seg1->mr_offset);
1922 seg1->mr_offset -= pageoff; /* start of page */
1923 seg1->mr_len += pageoff;
1924 len = -pageoff;
1925 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1926 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1927 for (i = 0; i < *nsegs;) {
1928 rpcrdma_map_one(ia, seg, writing);
1929 physaddrs[i] = seg->mr_dma;
1930 len += seg->mr_len;
1931 ++seg;
1932 ++i;
1933 /* Check for holes */
1934 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1935 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1936 break;
1937 }
1938 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1939 physaddrs, i, seg1->mr_dma);
1940 if (rc) {
1941 dprintk("RPC: %s: failed ib_map_phys_fmr "
1942 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1943 len, (unsigned long long)seg1->mr_dma,
1944 pageoff, i, rc);
1945 while (i--)
1946 rpcrdma_unmap_one(ia, --seg);
1947 } else {
1948 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1949 seg1->mr_base = seg1->mr_dma + pageoff;
1950 seg1->mr_nsegs = i;
1951 seg1->mr_len = len;
1952 }
1953 *nsegs = i;
1954 return rc;
1955}
1956
1957static int
1958rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1959 struct rpcrdma_ia *ia)
1960{
1961 struct rpcrdma_mr_seg *seg1 = seg;
1962 LIST_HEAD(l);
1963 int rc;
1964
1965 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1966 rc = ib_unmap_fmr(&l);
73806c88 1967 read_lock(&ia->ri_qplock);
8d4ba034
TT
1968 while (seg1->mr_nsegs--)
1969 rpcrdma_unmap_one(ia, seg++);
73806c88 1970 read_unlock(&ia->ri_qplock);
8d4ba034
TT
1971 if (rc)
1972 dprintk("RPC: %s: failed ib_unmap_fmr,"
1973 " status %i\n", __func__, rc);
1974 return rc;
1975}
1976
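/*
 * rpcrdma_register_external() dispatches on the memory registration
 * strategy selected when the transport was set up and returns the
 * number of segments actually covered, or a negative errno.
 * ALLPHYSICAL maps a single segment and reuses the pre-registered
 * ri_bind_mem rkey, so it always consumes exactly one segment; FRMR
 * and FMR may coalesce several page-aligned segments under one rkey.
 *
 * A minimal caller sketch (local names are illustrative only):
 *
 *	n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
 *	if (n < 0)
 *		return n;
 *	... advertise seg->mr_rkey, seg->mr_base, seg->mr_len in a chunk ...
 *	... once the reply has been processed ...
 *	rpcrdma_deregister_external(seg, r_xprt);
 */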
c56c65fb
TT
1977int
1978rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1979 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1980{
1981 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
c56c65fb
TT
1982 int rc = 0;
1983
1984 switch (ia->ri_memreg_strategy) {
1985
c56c65fb
TT
1986 case RPCRDMA_ALLPHYSICAL:
1987 rpcrdma_map_one(ia, seg, writing);
1988 seg->mr_rkey = ia->ri_bind_mem->rkey;
1989 seg->mr_base = seg->mr_dma;
1990 seg->mr_nsegs = 1;
1991 nsegs = 1;
1992 break;
c56c65fb 1993
3197d309
TT
1994	/* Registration using fast registration memory regions (FRMR) */
1995 case RPCRDMA_FRMR:
1996 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1997 break;
1998
8d4ba034 1999	/* Registration using fast memory regions (FMR) */
c56c65fb 2000 case RPCRDMA_MTHCAFMR:
8d4ba034 2001 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
c56c65fb
TT
2002 break;
2003
c56c65fb 2004 default:
92b98361 2005 return -EIO;
c56c65fb
TT
2006 }
2007 if (rc)
92b98361 2008 return rc;
c56c65fb
TT
2009
2010 return nsegs;
2011}
2012
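/*
 * rpcrdma_deregister_external() undoes a prior registration.  Verb
 * failures are logged by the strategy-specific helpers and otherwise
 * ignored here; the caller always gets back the number of segments
 * that were unmapped.
 */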
2013int
2014rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
13c9ff8f 2015 struct rpcrdma_xprt *r_xprt)
c56c65fb
TT
2016{
2017 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
c56c65fb
TT
2018 int nsegs = seg->mr_nsegs, rc;
2019
2020 switch (ia->ri_memreg_strategy) {
2021
c56c65fb 2022 case RPCRDMA_ALLPHYSICAL:
73806c88 2023 read_lock(&ia->ri_qplock);
c56c65fb 2024 rpcrdma_unmap_one(ia, seg);
73806c88 2025 read_unlock(&ia->ri_qplock);
c56c65fb 2026 break;
c56c65fb 2027
3197d309
TT
2028 case RPCRDMA_FRMR:
2029 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
2030 break;
2031
c56c65fb 2032 case RPCRDMA_MTHCAFMR:
8d4ba034 2033 rc = rpcrdma_deregister_fmr_external(seg, ia);
c56c65fb
TT
2034 break;
2035
c56c65fb 2036 default:
c56c65fb
TT
2037 break;
2038 }
c56c65fb
TT
2039 return nsegs;
2040}
2041
2042/*
2043 * Prepost any receive buffer, then post send.
2044 *
2045 * Receive buffer is donated to hardware, reclaimed upon recv completion.
2046 */
2047int
2048rpcrdma_ep_post(struct rpcrdma_ia *ia,
2049 struct rpcrdma_ep *ep,
2050 struct rpcrdma_req *req)
2051{
2052 struct ib_send_wr send_wr, *send_wr_fail;
2053 struct rpcrdma_rep *rep = req->rl_reply;
2054 int rc;
2055
2056 if (rep) {
2057 rc = rpcrdma_ep_post_recv(ia, ep, rep);
2058 if (rc)
2059 goto out;
2060 req->rl_reply = NULL;
2061 }
2062
2063 send_wr.next = NULL;
2064 send_wr.wr_id = 0ULL; /* no send cookie */
2065 send_wr.sg_list = req->rl_send_iov;
2066 send_wr.num_sge = req->rl_niovs;
2067 send_wr.opcode = IB_WR_SEND;
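	/*
	 * The send buffers were DMA-mapped earlier, so every SGE the
	 * CPU has written must be synced for the device before the
	 * send is posted.  iov[0] carries the RPC-over-RDMA header and
	 * iov[1] the RPC message; when four SGEs are in use, iov[2] is
	 * the constant zero pad (never dirtied, hence not synced) and
	 * iov[3] the remaining tail data.
	 */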
c56c65fb
TT
2068 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
2069 ib_dma_sync_single_for_device(ia->ri_id->device,
2070 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
2071 DMA_TO_DEVICE);
2072 ib_dma_sync_single_for_device(ia->ri_id->device,
2073 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
2074 DMA_TO_DEVICE);
2075 ib_dma_sync_single_for_device(ia->ri_id->device,
2076 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
2077 DMA_TO_DEVICE);
2078
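	/*
	 * Sends are posted unsignaled until the CQ credit count runs
	 * out; then a single signaled send is issued and the count is
	 * replenished, giving the provider a periodic chance to reap
	 * completed send WRs without signaling every request.
	 */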
2079 if (DECR_CQCOUNT(ep) > 0)
2080 send_wr.send_flags = 0;
2081 else { /* Provider must take a send completion every now and then */
2082 INIT_CQCOUNT(ep);
2083 send_wr.send_flags = IB_SEND_SIGNALED;
2084 }
2085
2086 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2087 if (rc)
2088 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
2089 rc);
2090out:
2091 return rc;
2092}
2093
2094/*
2095 * (Re)post a receive buffer.
2096 */
2097int
2098rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2099 struct rpcrdma_ep *ep,
2100 struct rpcrdma_rep *rep)
2101{
2102 struct ib_recv_wr recv_wr, *recv_wr_fail;
2103 int rc;
2104
2105 recv_wr.next = NULL;
2106 recv_wr.wr_id = (u64) (unsigned long) rep;
2107 recv_wr.sg_list = &rep->rr_iov;
2108 recv_wr.num_sge = 1;
2109
2110 ib_dma_sync_single_for_cpu(ia->ri_id->device,
2111 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
2112
c56c65fb
TT
2113 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2114
2115 if (rc)
2116 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
2117 rc);
2118 return rc;
2119}
43e95988
CL
2120
2121/* Physical mapping means one Read/Write list entry per page.
2122 * All list entries must fit within an inline buffer.
2123 *
2124 * NB: The server must return a Write list for NFS READ,
2125 * which has the same constraint. Factor in the inline
2126 * rsize as well.
2127 */
2128static size_t
2129rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2130{
2131 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2132 unsigned int inline_size, pages;
2133
2134 inline_size = min_t(unsigned int,
2135 cdata->inline_wsize, cdata->inline_rsize);
2136 inline_size -= RPCRDMA_HDRLEN_MIN;
2137 pages = inline_size / sizeof(struct rpcrdma_segment);
2138 return pages << PAGE_SHIFT;
2139}
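/*
 * Worked example, assuming the usual 1024-byte inline buffers, the
 * 28-byte minimal RPC-over-RDMA header, and a 16-byte rpcrdma_segment:
 * (1024 - 28) / 16 = 62 list entries, i.e. 62 pages or 248KB of
 * payload with 4KB pages.
 */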
2140
2141static size_t
2142rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2143{
2144 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2145}
2146
2147size_t
2148rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2149{
2150 size_t result;
2151
2152 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2153 case RPCRDMA_ALLPHYSICAL:
2154 result = rpcrdma_physical_max_payload(r_xprt);
2155 break;
2156 default:
2157 result = rpcrdma_mr_max_payload(r_xprt);
2158 }
2159 return result;
2160}