f58851e6 1/*
c56c65fb
TT
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
f58851e6
TT
38 */
39
c56c65fb
TT
40/*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
a6b7a407 50#include <linux/interrupt.h>
5a0e3ad6 51#include <linux/slab.h>
eba8ff66 52#include <linux/prefetch.h>
65866f82 53#include <asm/bitops.h>
c56c65fb 54
f58851e6
TT
55#include "xprt_rdma.h"
56
c56c65fb
TT
57/*
58 * Globals/Macros
59 */
60
f895b252 61#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb
TT
62# define RPCDBG_FACILITY RPCDBG_TRANS
63#endif
64
9f9d802a 65static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
467c9674 66static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
9f9d802a 67
c56c65fb
TT
68/*
69 * internal functions
70 */
71
72/*
73 * handle replies in tasklet context, using a single, global list
74 * rdma tasklet function -- just turn around and call the func
75 * for all replies on the list
76 */
77
78static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
79static LIST_HEAD(rpcrdma_tasklets_g);
80
81static void
82rpcrdma_run_tasklet(unsigned long data)
83{
84 struct rpcrdma_rep *rep;
85 void (*func)(struct rpcrdma_rep *);
86 unsigned long flags;
87
 88 data = data; /* tasklet data argument is unused */
89 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
90 while (!list_empty(&rpcrdma_tasklets_g)) {
91 rep = list_entry(rpcrdma_tasklets_g.next,
92 struct rpcrdma_rep, rr_list);
93 list_del(&rep->rr_list);
94 func = rep->rr_func;
95 rep->rr_func = NULL;
96 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
97
98 if (func)
99 func(rep);
100 else
101 rpcrdma_recv_buffer_put(rep);
102
103 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
104 }
105 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
106}
107
108static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
109
7ff11de1
CL
110static const char * const async_event[] = {
111 "CQ error",
112 "QP fatal error",
113 "QP request error",
114 "QP access error",
115 "communication established",
116 "send queue drained",
117 "path migration successful",
118 "path mig error",
119 "device fatal error",
120 "port active",
121 "port error",
122 "LID change",
123 "P_key change",
124 "SM change",
125 "SRQ error",
126 "SRQ limit reached",
127 "last WQE reached",
128 "client reregister",
129 "GID change",
130};
131
132#define ASYNC_MSG(status) \
133 ((status) < ARRAY_SIZE(async_event) ? \
134 async_event[(status)] : "unknown async error")
135
f1a03b76
CL
136static void
137rpcrdma_schedule_tasklet(struct list_head *sched_list)
138{
139 unsigned long flags;
140
141 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
142 list_splice_tail(sched_list, &rpcrdma_tasklets_g);
143 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
144 tasklet_schedule(&rpcrdma_tasklet_g);
145}
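/* Reply dispatch path, as wired up by the helpers above and the
 * receive completion handlers below (call order shown for
 * illustration):
 *
 *	rpcrdma_recvcq_upcall(cq, ep)
 *	  -> rpcrdma_recvcq_poll(cq, ep)
 *	       -> rpcrdma_recvcq_process_wc(wc, &sched_list)
 *	  -> rpcrdma_schedule_tasklet(&sched_list)
 *	... later, in softirq context ...
 *	rpcrdma_run_tasklet()
 *	  -> rep->rr_func(rep), or rpcrdma_recv_buffer_put(rep)
 *
 * rpcrdma_tk_lock_g serializes the handoff between the pollers and
 * the single global tasklet.
 */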
146
c56c65fb
TT
147static void
148rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
149{
150 struct rpcrdma_ep *ep = context;
151
7ff11de1
CL
152 pr_err("RPC: %s: %s on device %s ep %p\n",
153 __func__, ASYNC_MSG(event->event),
154 event->device->name, context);
c56c65fb
TT
155 if (ep->rep_connected == 1) {
156 ep->rep_connected = -EIO;
afadc468 157 rpcrdma_conn_func(ep);
c56c65fb
TT
158 wake_up_all(&ep->rep_connect_wait);
159 }
160}
161
162static void
163rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
164{
165 struct rpcrdma_ep *ep = context;
166
7ff11de1
CL
167 pr_err("RPC: %s: %s on device %s ep %p\n",
168 __func__, ASYNC_MSG(event->event),
169 event->device->name, context);
c56c65fb
TT
170 if (ep->rep_connected == 1) {
171 ep->rep_connected = -EIO;
afadc468 172 rpcrdma_conn_func(ep);
c56c65fb
TT
173 wake_up_all(&ep->rep_connect_wait);
174 }
175}
176
8502427c
CL
177static const char * const wc_status[] = {
178 "success",
179 "local length error",
180 "local QP operation error",
181 "local EE context operation error",
182 "local protection error",
183 "WR flushed",
184 "memory management operation error",
185 "bad response error",
186 "local access error",
187 "remote invalid request error",
188 "remote access error",
189 "remote operation error",
190 "transport retry counter exceeded",
191 "RNR retrycounter exceeded",
192 "local RDD violation error",
193 "remove invalid RD request",
194 "operation aborted",
195 "invalid EE context number",
196 "invalid EE context state",
197 "fatal error",
198 "response timeout error",
199 "general error",
200};
201
202#define COMPLETION_MSG(status) \
203 ((status) < ARRAY_SIZE(wc_status) ? \
204 wc_status[(status)] : "unexpected completion error")
205
fc664485
CL
206static void
207rpcrdma_sendcq_process_wc(struct ib_wc *wc)
c56c65fb 208{
8502427c 209 if (likely(wc->status == IB_WC_SUCCESS))
c56c65fb 210 return;
8502427c
CL
211
212 /* WARNING: Only wr_id and status are reliable at this point */
213 if (wc->wr_id == 0ULL) {
214 if (wc->status != IB_WC_WR_FLUSH_ERR)
215 pr_err("RPC: %s: SEND: %s\n",
216 __func__, COMPLETION_MSG(wc->status));
217 } else {
218 struct rpcrdma_mw *r;
219
220 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
221 r->r.frmr.fr_state = FRMR_IS_STALE;
222 pr_err("RPC: %s: frmr %p (stale): %s\n",
223 __func__, r, COMPLETION_MSG(wc->status));
224 }
c56c65fb
TT
225}
226
fc664485 227static int
1c00dd07 228rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
c56c65fb 229{
1c00dd07 230 struct ib_wc *wcs;
8301a2c0 231 int budget, count, rc;
c56c65fb 232
8301a2c0 233 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
1c00dd07
CL
234 do {
235 wcs = ep->rep_send_wcs;
236
237 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
238 if (rc <= 0)
239 return rc;
240
241 count = rc;
242 while (count-- > 0)
243 rpcrdma_sendcq_process_wc(wcs++);
8301a2c0 244 } while (rc == RPCRDMA_POLLSIZE && --budget);
1c00dd07 245 return 0;
fc664485 246}
c56c65fb 247
fc664485
CL
248/*
249 * Handle send, fast_reg_mr, and local_inv completions.
250 *
251 * Send events are typically suppressed and thus do not result
252 * in an upcall. Occasionally one is signaled, however. This
253 * prevents the provider's completion queue from wrapping and
254 * losing a completion.
255 */
256static void
257rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
258{
1c00dd07 259 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
fc664485
CL
260 int rc;
261
1c00dd07 262 rc = rpcrdma_sendcq_poll(cq, ep);
fc664485
CL
263 if (rc) {
264 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
265 __func__, rc);
266 return;
c56c65fb
TT
267 }
268
7f23f6f6
CL
269 rc = ib_req_notify_cq(cq,
270 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
271 if (rc == 0)
272 return;
273 if (rc < 0) {
fc664485
CL
274 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
275 __func__, rc);
276 return;
277 }
278
1c00dd07 279 rpcrdma_sendcq_poll(cq, ep);
fc664485
CL
280}
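/* Both CQ upcalls follow the same poll/re-arm pattern (summarized here
 * for clarity): poll in batches of RPCRDMA_POLLSIZE until the CQ is
 * empty or the budget of RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE batches
 * is spent, then re-arm with IB_CQ_REPORT_MISSED_EVENTS. A positive
 * return from ib_req_notify_cq() means completions arrived between the
 * last poll and re-arming, so the CQ is polled one more time rather
 * than risking a lost wakeup.
 */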
281
282static void
bb96193d 283rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
fc664485
CL
284{
285 struct rpcrdma_rep *rep =
286 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
287
8502427c
CL
288 /* WARNING: Only wr_id and status are reliable at this point */
289 if (wc->status != IB_WC_SUCCESS)
290 goto out_fail;
fc664485 291
8502427c 292 /* status == SUCCESS means all fields in wc are trustworthy */
fc664485
CL
293 if (wc->opcode != IB_WC_RECV)
294 return;
295
8502427c
CL
296 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
297 __func__, rep, wc->byte_len);
298
fc664485
CL
299 rep->rr_len = wc->byte_len;
300 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
301 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
eba8ff66 302 prefetch(rep->rr_base);
fc664485
CL
303
304out_schedule:
bb96193d 305 list_add_tail(&rep->rr_list, sched_list);
8502427c
CL
306 return;
307out_fail:
308 if (wc->status != IB_WC_WR_FLUSH_ERR)
309 pr_err("RPC: %s: rep %p: %s\n",
310 __func__, rep, COMPLETION_MSG(wc->status));
311 rep->rr_len = ~0U;
312 goto out_schedule;
fc664485
CL
313}
314
315static int
1c00dd07 316rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
fc664485 317{
bb96193d 318 struct list_head sched_list;
1c00dd07 319 struct ib_wc *wcs;
8301a2c0 320 int budget, count, rc;
fc664485 321
bb96193d 322 INIT_LIST_HEAD(&sched_list);
8301a2c0 323 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
1c00dd07
CL
324 do {
325 wcs = ep->rep_recv_wcs;
326
327 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
328 if (rc <= 0)
bb96193d 329 goto out_schedule;
1c00dd07
CL
330
331 count = rc;
332 while (count-- > 0)
bb96193d 333 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
8301a2c0 334 } while (rc == RPCRDMA_POLLSIZE && --budget);
bb96193d
CL
335 rc = 0;
336
337out_schedule:
f1a03b76 338 rpcrdma_schedule_tasklet(&sched_list);
bb96193d 339 return rc;
c56c65fb
TT
340}
341
342/*
fc664485 343 * Handle receive completions.
c56c65fb 344 *
c56c65fb
TT
345 * It is reentrant but processes single events in order to maintain
346 * ordering of receives to keep server credits.
347 *
348 * It is the responsibility of the scheduled tasklet to return
349 * recv buffers to the pool. NOTE: this affects synchronization of
350 * connection shutdown. That is, the structures required for
351 * the completion of the reply handler must remain intact until
352 * all memory has been reclaimed.
c56c65fb
TT
353 */
354static void
fc664485 355rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
c56c65fb 356{
1c00dd07 357 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
c56c65fb
TT
358 int rc;
359
1c00dd07 360 rc = rpcrdma_recvcq_poll(cq, ep);
fc664485
CL
361 if (rc) {
362 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
363 __func__, rc);
c56c65fb 364 return;
fc664485 365 }
c56c65fb 366
7f23f6f6
CL
367 rc = ib_req_notify_cq(cq,
368 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
369 if (rc == 0)
370 return;
371 if (rc < 0) {
fc664485 372 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
c56c65fb
TT
373 __func__, rc);
374 return;
375 }
376
1c00dd07 377 rpcrdma_recvcq_poll(cq, ep);
c56c65fb
TT
378}
379
a7bc211a
CL
380static void
381rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
382{
5c166bef
CL
383 struct ib_wc wc;
384 LIST_HEAD(sched_list);
385
386 while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
387 rpcrdma_recvcq_process_wc(&wc, &sched_list);
388 if (!list_empty(&sched_list))
389 rpcrdma_schedule_tasklet(&sched_list);
390 while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
391 rpcrdma_sendcq_process_wc(&wc);
a7bc211a
CL
392}
393
f895b252 394#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb
TT
395static const char * const conn[] = {
396 "address resolved",
397 "address error",
398 "route resolved",
399 "route error",
400 "connect request",
401 "connect response",
402 "connect error",
403 "unreachable",
404 "rejected",
405 "established",
406 "disconnected",
8079fb78
CL
407 "device removal",
408 "multicast join",
409 "multicast error",
410 "address change",
411 "timewait exit",
c56c65fb 412};
8079fb78
CL
413
414#define CONNECTION_MSG(status) \
415 ((status) < ARRAY_SIZE(conn) ? \
416 conn[(status)] : "unrecognized connection error")
c56c65fb
TT
417#endif
418
419static int
420rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
421{
422 struct rpcrdma_xprt *xprt = id->context;
423 struct rpcrdma_ia *ia = &xprt->rx_ia;
424 struct rpcrdma_ep *ep = &xprt->rx_ep;
f895b252 425#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
c56c65fb 426 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
ff0db049 427#endif
ce1ab9ab
CL
428 struct ib_qp_attr *attr = &ia->ri_qp_attr;
429 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
c56c65fb
TT
430 int connstate = 0;
431
432 switch (event->event) {
433 case RDMA_CM_EVENT_ADDR_RESOLVED:
434 case RDMA_CM_EVENT_ROUTE_RESOLVED:
5675add3 435 ia->ri_async_rc = 0;
c56c65fb
TT
436 complete(&ia->ri_done);
437 break;
438 case RDMA_CM_EVENT_ADDR_ERROR:
439 ia->ri_async_rc = -EHOSTUNREACH;
440 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
441 __func__, ep);
442 complete(&ia->ri_done);
443 break;
444 case RDMA_CM_EVENT_ROUTE_ERROR:
445 ia->ri_async_rc = -ENETUNREACH;
446 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
447 __func__, ep);
448 complete(&ia->ri_done);
449 break;
450 case RDMA_CM_EVENT_ESTABLISHED:
451 connstate = 1;
ce1ab9ab
CL
452 ib_query_qp(ia->ri_id->qp, attr,
453 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
454 iattr);
c56c65fb
TT
455 dprintk("RPC: %s: %d responder resources"
456 " (%d initiator)\n",
ce1ab9ab
CL
457 __func__, attr->max_dest_rd_atomic,
458 attr->max_rd_atomic);
c56c65fb
TT
459 goto connected;
460 case RDMA_CM_EVENT_CONNECT_ERROR:
461 connstate = -ENOTCONN;
462 goto connected;
463 case RDMA_CM_EVENT_UNREACHABLE:
464 connstate = -ENETDOWN;
465 goto connected;
466 case RDMA_CM_EVENT_REJECTED:
467 connstate = -ECONNREFUSED;
468 goto connected;
469 case RDMA_CM_EVENT_DISCONNECTED:
470 connstate = -ECONNABORTED;
471 goto connected;
472 case RDMA_CM_EVENT_DEVICE_REMOVAL:
473 connstate = -ENODEV;
474connected:
c56c65fb
TT
475 dprintk("RPC: %s: %sconnected\n",
476 __func__, connstate > 0 ? "" : "dis");
477 ep->rep_connected = connstate;
afadc468 478 rpcrdma_conn_func(ep);
c56c65fb 479 wake_up_all(&ep->rep_connect_wait);
8079fb78 480 /*FALLTHROUGH*/
c56c65fb 481 default:
8079fb78
CL
482 dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n",
483 __func__, &addr->sin_addr.s_addr,
484 ntohs(addr->sin_port), ep,
485 CONNECTION_MSG(event->event));
c56c65fb
TT
486 break;
487 }
488
f895b252 489#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
b3cd8d45 490 if (connstate == 1) {
ce1ab9ab 491 int ird = attr->max_dest_rd_atomic;
b3cd8d45 492 int tird = ep->rep_remote_cma.responder_resources;
21454aaa 493 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
b3cd8d45 494 "on %s, memreg %d slots %d ird %d%s\n",
21454aaa 495 &addr->sin_addr.s_addr,
b3cd8d45
TT
496 ntohs(addr->sin_port),
497 ia->ri_id->device->name,
498 ia->ri_memreg_strategy,
499 xprt->rx_buf.rb_max_requests,
500 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
501 } else if (connstate < 0) {
21454aaa
HH
502 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
503 &addr->sin_addr.s_addr,
b3cd8d45
TT
504 ntohs(addr->sin_port),
505 connstate);
506 }
507#endif
508
c56c65fb
TT
509 return 0;
510}
511
512static struct rdma_cm_id *
513rpcrdma_create_id(struct rpcrdma_xprt *xprt,
514 struct rpcrdma_ia *ia, struct sockaddr *addr)
515{
516 struct rdma_cm_id *id;
517 int rc;
518
1a954051
TT
519 init_completion(&ia->ri_done);
520
b26f9b99 521 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
c56c65fb
TT
522 if (IS_ERR(id)) {
523 rc = PTR_ERR(id);
524 dprintk("RPC: %s: rdma_create_id() failed %i\n",
525 __func__, rc);
526 return id;
527 }
528
5675add3 529 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
530 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
531 if (rc) {
532 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
533 __func__, rc);
534 goto out;
535 }
5675add3
TT
536 wait_for_completion_interruptible_timeout(&ia->ri_done,
537 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
538 rc = ia->ri_async_rc;
539 if (rc)
540 goto out;
541
5675add3 542 ia->ri_async_rc = -ETIMEDOUT;
c56c65fb
TT
543 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
544 if (rc) {
545 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
546 __func__, rc);
547 goto out;
548 }
5675add3
TT
549 wait_for_completion_interruptible_timeout(&ia->ri_done,
550 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
c56c65fb
TT
551 rc = ia->ri_async_rc;
552 if (rc)
553 goto out;
554
555 return id;
556
557out:
558 rdma_destroy_id(id);
559 return ERR_PTR(rc);
560}
561
562/*
563 * Drain any cq, prior to teardown.
564 */
565static void
566rpcrdma_clean_cq(struct ib_cq *cq)
567{
568 struct ib_wc wc;
569 int count = 0;
570
571 while (1 == ib_poll_cq(cq, 1, &wc))
572 ++count;
573
574 if (count)
575 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
576 __func__, count, wc.opcode);
577}
578
579/*
580 * Exported functions.
581 */
582
583/*
584 * Open and initialize an Interface Adapter.
585 * o initializes fields of struct rpcrdma_ia, including
586 * interface and provider attributes and protection zone.
587 */
588int
589rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
590{
bd7ed1d1 591 int rc, mem_priv;
c56c65fb 592 struct rpcrdma_ia *ia = &xprt->rx_ia;
7bc7972c 593 struct ib_device_attr *devattr = &ia->ri_devattr;
c56c65fb 594
c56c65fb
TT
595 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
596 if (IS_ERR(ia->ri_id)) {
597 rc = PTR_ERR(ia->ri_id);
598 goto out1;
599 }
600
601 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
602 if (IS_ERR(ia->ri_pd)) {
603 rc = PTR_ERR(ia->ri_pd);
604 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
605 __func__, rc);
606 goto out2;
607 }
608
7bc7972c 609 rc = ib_query_device(ia->ri_id->device, devattr);
bd7ed1d1
TT
610 if (rc) {
611 dprintk("RPC: %s: ib_query_device failed %d\n",
612 __func__, rc);
5ae711a2 613 goto out3;
bd7ed1d1
TT
614 }
615
7bc7972c 616 if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
bd7ed1d1
TT
617 ia->ri_have_dma_lkey = 1;
618 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
619 }
620
f10eafd3 621 if (memreg == RPCRDMA_FRMR) {
3197d309 622 /* Requires both frmr reg and local dma lkey */
7bc7972c 623 if ((devattr->device_cap_flags &
3197d309
TT
624 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
625 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
3197d309 626 dprintk("RPC: %s: FRMR registration "
f10eafd3
CL
627 "not supported by HCA\n", __func__);
628 memreg = RPCRDMA_MTHCAFMR;
0fc6c4e7
SW
629 } else {
630 /* Mind the ia limit on FRMR page list depth */
631 ia->ri_max_frmr_depth = min_t(unsigned int,
632 RPCRDMA_MAX_DATA_SEGS,
7bc7972c 633 devattr->max_fast_reg_page_list_len);
bd7ed1d1 634 }
f10eafd3
CL
635 }
636 if (memreg == RPCRDMA_MTHCAFMR) {
637 if (!ia->ri_id->device->alloc_fmr) {
638 dprintk("RPC: %s: MTHCAFMR registration "
639 "not supported by HCA\n", __func__);
f10eafd3 640 memreg = RPCRDMA_ALLPHYSICAL;
f10eafd3 641 }
bd7ed1d1
TT
642 }
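	/* At this point memreg has been downgraded as needed: FRMR falls
	 * back to MTHCAFMR when the HCA lacks fast-register support, and
	 * MTHCAFMR falls back to ALLPHYSICAL when it lacks FMRs.
	 */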
643
c56c65fb
TT
644 /*
645 * Optionally obtain an underlying physical identity mapping in
646 * order to do a memory window-based bind. This base registration
647 * is protected from remote access - that is enabled only by binding
648 * for the specific bytes targeted during each RPC operation, and
649 * revoked after the corresponding completion similar to a storage
650 * adapter.
651 */
bd7ed1d1 652 switch (memreg) {
3197d309 653 case RPCRDMA_FRMR:
bd7ed1d1 654 break;
bd7ed1d1
TT
655 case RPCRDMA_ALLPHYSICAL:
656 mem_priv = IB_ACCESS_LOCAL_WRITE |
657 IB_ACCESS_REMOTE_WRITE |
658 IB_ACCESS_REMOTE_READ;
659 goto register_setup;
bd7ed1d1
TT
660 case RPCRDMA_MTHCAFMR:
661 if (ia->ri_have_dma_lkey)
c56c65fb 662 break;
bd7ed1d1
TT
663 mem_priv = IB_ACCESS_LOCAL_WRITE;
664 register_setup:
c56c65fb
TT
665 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
666 if (IS_ERR(ia->ri_bind_mem)) {
667 printk(KERN_ALERT "%s: ib_get_dma_mr for "
0ac531c1 668 "phys register failed with %lX\n",
c56c65fb 669 __func__, PTR_ERR(ia->ri_bind_mem));
0ac531c1 670 rc = -ENOMEM;
5ae711a2 671 goto out3;
c56c65fb 672 }
bd7ed1d1
TT
673 break;
674 default:
cdd9ade7
CL
675 printk(KERN_ERR "RPC: Unsupported memory "
676 "registration mode: %d\n", memreg);
677 rc = -ENOMEM;
5ae711a2 678 goto out3;
c56c65fb 679 }
bd7ed1d1
TT
680 dprintk("RPC: %s: memory registration strategy is %d\n",
681 __func__, memreg);
c56c65fb
TT
682
683 /* Else will do memory reg/dereg for each chunk */
684 ia->ri_memreg_strategy = memreg;
685
73806c88 686 rwlock_init(&ia->ri_qplock);
c56c65fb 687 return 0;
5ae711a2
CL
688
689out3:
690 ib_dealloc_pd(ia->ri_pd);
691 ia->ri_pd = NULL;
c56c65fb
TT
692out2:
693 rdma_destroy_id(ia->ri_id);
fee08caf 694 ia->ri_id = NULL;
c56c65fb
TT
695out1:
696 return rc;
697}
698
699/*
700 * Clean up/close an IA.
701 * o if event handles and PD have been initialized, free them.
702 * o close the IA
703 */
704void
705rpcrdma_ia_close(struct rpcrdma_ia *ia)
706{
707 int rc;
708
709 dprintk("RPC: %s: entering\n", __func__);
710 if (ia->ri_bind_mem != NULL) {
711 rc = ib_dereg_mr(ia->ri_bind_mem);
712 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
713 __func__, rc);
714 }
fee08caf
TT
715 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
716 if (ia->ri_id->qp)
717 rdma_destroy_qp(ia->ri_id);
718 rdma_destroy_id(ia->ri_id);
719 ia->ri_id = NULL;
720 }
c56c65fb
TT
721 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
722 rc = ib_dealloc_pd(ia->ri_pd);
723 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
724 __func__, rc);
725 }
c56c65fb
TT
726}
727
728/*
729 * Create unconnected endpoint.
730 */
731int
732rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
733 struct rpcrdma_create_data_internal *cdata)
734{
7bc7972c 735 struct ib_device_attr *devattr = &ia->ri_devattr;
fc664485 736 struct ib_cq *sendcq, *recvcq;
5d40a8a5 737 int rc, err;
c56c65fb 738
c56c65fb 739 /* check provider's send/recv wr limits */
7bc7972c
CL
740 if (cdata->max_requests > devattr->max_qp_wr)
741 cdata->max_requests = devattr->max_qp_wr;
c56c65fb
TT
742
743 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
744 ep->rep_attr.qp_context = ep;
745 /* send_cq and recv_cq initialized below */
746 ep->rep_attr.srq = NULL;
747 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
748 switch (ia->ri_memreg_strategy) {
0fc6c4e7
SW
749 case RPCRDMA_FRMR: {
750 int depth = 7;
751
15cdc644
TT
752 /* Add room for frmr register and invalidate WRs.
753 * 1. FRMR reg WR for head
754 * 2. FRMR invalidate WR for head
0fc6c4e7
SW
755 * 3. N FRMR reg WRs for pagelist
756 * 4. N FRMR invalidate WRs for pagelist
15cdc644
TT
757 * 5. FRMR reg WR for tail
758 * 6. FRMR invalidate WR for tail
759 * 7. The RDMA_SEND WR
760 */
0fc6c4e7
SW
761
762 /* Calculate N if the device max FRMR depth is smaller than
763 * RPCRDMA_MAX_DATA_SEGS.
764 */
765 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
766 int delta = RPCRDMA_MAX_DATA_SEGS -
767 ia->ri_max_frmr_depth;
768
769 do {
770 depth += 2; /* FRMR reg + invalidate */
771 delta -= ia->ri_max_frmr_depth;
772 } while (delta > 0);
773
774 }
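		/* Illustrative arithmetic (values assumed): if
		 * RPCRDMA_MAX_DATA_SEGS is 64 and the device reports
		 * ri_max_frmr_depth == 16, delta starts at 48 and the
		 * loop above adds three more reg/invalidate pairs
		 * (four are needed to cover the pagelist), giving
		 * depth == 13 send WRs per RPC.
		 */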
775 ep->rep_attr.cap.max_send_wr *= depth;
7bc7972c
CL
776 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
777 cdata->max_requests = devattr->max_qp_wr / depth;
15cdc644
TT
778 if (!cdata->max_requests)
779 return -EINVAL;
0fc6c4e7
SW
780 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
781 depth;
15cdc644 782 }
3197d309 783 break;
0fc6c4e7 784 }
c56c65fb
TT
785 default:
786 break;
787 }
788 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
789 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
790 ep->rep_attr.cap.max_recv_sge = 1;
791 ep->rep_attr.cap.max_inline_data = 0;
792 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
793 ep->rep_attr.qp_type = IB_QPT_RC;
794 ep->rep_attr.port_num = ~0;
795
796 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
797 "iovs: send %d recv %d\n",
798 __func__,
799 ep->rep_attr.cap.max_send_wr,
800 ep->rep_attr.cap.max_recv_wr,
801 ep->rep_attr.cap.max_send_sge,
802 ep->rep_attr.cap.max_recv_sge);
803
804 /* set trigger for requesting send completion */
fc664485 805 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
e7104a2a
CL
806 if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
807 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
808 else if (ep->rep_cqinit <= 2)
c56c65fb
TT
809 ep->rep_cqinit = 0;
810 INIT_CQCOUNT(ep);
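	/* rep_cqinit is roughly half the send queue depth, capped at
	 * RPCRDMA_MAX_UNSIGNALED_SENDS (and forced to zero for very
	 * shallow queues so that every send is signaled). The send path
	 * counts it down via DECR_CQCOUNT() and posts a signaled WR
	 * once the count is exhausted, which keeps the send CQ from
	 * wrapping as described above rpcrdma_sendcq_upcall().
	 */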
c56c65fb 811 init_waitqueue_head(&ep->rep_connect_wait);
254f91e2 812 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
c56c65fb 813
fc664485 814 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
1c00dd07 815 rpcrdma_cq_async_error_upcall, ep,
c56c65fb 816 ep->rep_attr.cap.max_send_wr + 1, 0);
fc664485
CL
817 if (IS_ERR(sendcq)) {
818 rc = PTR_ERR(sendcq);
819 dprintk("RPC: %s: failed to create send CQ: %i\n",
c56c65fb
TT
820 __func__, rc);
821 goto out1;
822 }
823
fc664485 824 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
c56c65fb
TT
825 if (rc) {
826 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
827 __func__, rc);
828 goto out2;
829 }
830
fc664485 831 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
1c00dd07 832 rpcrdma_cq_async_error_upcall, ep,
fc664485
CL
833 ep->rep_attr.cap.max_recv_wr + 1, 0);
834 if (IS_ERR(recvcq)) {
835 rc = PTR_ERR(recvcq);
836 dprintk("RPC: %s: failed to create recv CQ: %i\n",
837 __func__, rc);
838 goto out2;
839 }
840
841 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
842 if (rc) {
843 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
844 __func__, rc);
845 ib_destroy_cq(recvcq);
846 goto out2;
847 }
848
849 ep->rep_attr.send_cq = sendcq;
850 ep->rep_attr.recv_cq = recvcq;
c56c65fb
TT
851
852 /* Initialize cma parameters */
853
854 /* RPC/RDMA does not use private data */
855 ep->rep_remote_cma.private_data = NULL;
856 ep->rep_remote_cma.private_data_len = 0;
857
858 /* Client offers RDMA Read but does not initiate */
b334eaab 859 ep->rep_remote_cma.initiator_depth = 0;
7bc7972c 860 if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
b334eaab
TT
861 ep->rep_remote_cma.responder_resources = 32;
862 else
7bc7972c
CL
863 ep->rep_remote_cma.responder_resources =
864 devattr->max_qp_rd_atom;
c56c65fb
TT
865
866 ep->rep_remote_cma.retry_count = 7;
867 ep->rep_remote_cma.flow_control = 0;
868 ep->rep_remote_cma.rnr_retry_count = 0;
869
870 return 0;
871
872out2:
fc664485 873 err = ib_destroy_cq(sendcq);
5d40a8a5
CL
874 if (err)
875 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
876 __func__, err);
c56c65fb
TT
877out1:
878 return rc;
879}
880
881/*
882 * rpcrdma_ep_destroy
883 *
884 * Disconnect and destroy endpoint. After this, the only
885 * valid operations on the ep are to free it (if dynamically
886 * allocated) or re-create it.
c56c65fb 887 */
7f1d5419 888void
c56c65fb
TT
889rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
890{
891 int rc;
892
893 dprintk("RPC: %s: entering, connected is %d\n",
894 __func__, ep->rep_connected);
895
254f91e2
CL
896 cancel_delayed_work_sync(&ep->rep_connect_worker);
897
c56c65fb 898 if (ia->ri_id->qp) {
282191cb 899 rpcrdma_ep_disconnect(ep, ia);
fee08caf
TT
900 rdma_destroy_qp(ia->ri_id);
901 ia->ri_id->qp = NULL;
c56c65fb
TT
902 }
903
c56c65fb
TT
904 /* padding - could be done in rpcrdma_buffer_destroy... */
905 if (ep->rep_pad_mr) {
906 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
907 ep->rep_pad_mr = NULL;
908 }
909
fc664485
CL
910 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
911 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
912 if (rc)
913 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
914 __func__, rc);
915
916 rpcrdma_clean_cq(ep->rep_attr.send_cq);
917 rc = ib_destroy_cq(ep->rep_attr.send_cq);
c56c65fb
TT
918 if (rc)
919 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
920 __func__, rc);
c56c65fb
TT
921}
922
923/*
924 * Connect unconnected endpoint.
925 */
926int
927rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
928{
73806c88 929 struct rdma_cm_id *id, *old;
c56c65fb
TT
930 int rc = 0;
931 int retry_count = 0;
c56c65fb 932
c055551e 933 if (ep->rep_connected != 0) {
c56c65fb
TT
934 struct rpcrdma_xprt *xprt;
935retry:
ec62f40d 936 dprintk("RPC: %s: reconnecting...\n", __func__);
282191cb
CL
937
938 rpcrdma_ep_disconnect(ep, ia);
a7bc211a 939 rpcrdma_flush_cqs(ep);
c56c65fb 940
467c9674
CL
941 switch (ia->ri_memreg_strategy) {
942 case RPCRDMA_FRMR:
9f9d802a 943 rpcrdma_reset_frmrs(ia);
467c9674
CL
944 break;
945 case RPCRDMA_MTHCAFMR:
946 rpcrdma_reset_fmrs(ia);
947 break;
948 case RPCRDMA_ALLPHYSICAL:
949 break;
950 default:
951 rc = -EIO;
952 goto out;
953 }
9f9d802a 954
c56c65fb
TT
955 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
956 id = rpcrdma_create_id(xprt, ia,
957 (struct sockaddr *)&xprt->rx_data.addr);
958 if (IS_ERR(id)) {
ec62f40d 959 rc = -EHOSTUNREACH;
c56c65fb
TT
960 goto out;
961 }
962 /* TEMP TEMP TEMP - fail if new device:
963 * Deregister/remarshal *all* requests!
964 * Close and recreate adapter, pd, etc!
965 * Re-determine all attributes still sane!
966 * More stuff I haven't thought of!
967 * Rrrgh!
968 */
969 if (ia->ri_id->device != id->device) {
970 printk("RPC: %s: can't reconnect on "
971 "different device!\n", __func__);
972 rdma_destroy_id(id);
ec62f40d 973 rc = -ENETUNREACH;
c56c65fb
TT
974 goto out;
975 }
976 /* END TEMP */
ec62f40d
CL
977 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
978 if (rc) {
979 dprintk("RPC: %s: rdma_create_qp failed %i\n",
980 __func__, rc);
981 rdma_destroy_id(id);
982 rc = -ENETUNREACH;
983 goto out;
984 }
73806c88
CL
985
986 write_lock(&ia->ri_qplock);
987 old = ia->ri_id;
c56c65fb 988 ia->ri_id = id;
73806c88
CL
989 write_unlock(&ia->ri_qplock);
990
991 rdma_destroy_qp(old);
992 rdma_destroy_id(old);
ec62f40d
CL
993 } else {
994 dprintk("RPC: %s: connecting...\n", __func__);
995 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
996 if (rc) {
997 dprintk("RPC: %s: rdma_create_qp failed %i\n",
998 __func__, rc);
999 /* do not update ep->rep_connected */
1000 return -ENETUNREACH;
1001 }
c56c65fb
TT
1002 }
1003
c56c65fb
TT
1004 ep->rep_connected = 0;
1005
1006 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
1007 if (rc) {
1008 dprintk("RPC: %s: rdma_connect() failed with %i\n",
1009 __func__, rc);
1010 goto out;
1011 }
1012
c56c65fb
TT
1013 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
1014
1015 /*
1016 * Check state. A non-peer reject indicates no listener
1017 * (ECONNREFUSED), which may be a transient state. All
1018 * others indicate a transport condition which has already
 1019 * undergone a best-effort connection attempt.
1020 */
f64f9e71
JP
1021 if (ep->rep_connected == -ECONNREFUSED &&
1022 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
c56c65fb
TT
1023 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
1024 goto retry;
1025 }
1026 if (ep->rep_connected <= 0) {
1027 /* Sometimes, the only way to reliably connect to remote
1028 * CMs is to use same nonzero values for ORD and IRD. */
b334eaab
TT
1029 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
1030 (ep->rep_remote_cma.responder_resources == 0 ||
1031 ep->rep_remote_cma.initiator_depth !=
1032 ep->rep_remote_cma.responder_resources)) {
1033 if (ep->rep_remote_cma.responder_resources == 0)
1034 ep->rep_remote_cma.responder_resources = 1;
1035 ep->rep_remote_cma.initiator_depth =
1036 ep->rep_remote_cma.responder_resources;
c56c65fb 1037 goto retry;
b334eaab 1038 }
c56c65fb
TT
1039 rc = ep->rep_connected;
1040 } else {
1041 dprintk("RPC: %s: connected\n", __func__);
1042 }
1043
1044out:
1045 if (rc)
1046 ep->rep_connected = rc;
1047 return rc;
1048}
1049
1050/*
1051 * rpcrdma_ep_disconnect
1052 *
1053 * This is separate from destroy to facilitate the ability
1054 * to reconnect without recreating the endpoint.
1055 *
1056 * This call is not reentrant, and must not be made in parallel
1057 * on the same endpoint.
1058 */
282191cb 1059void
c56c65fb
TT
1060rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
1061{
1062 int rc;
1063
a7bc211a 1064 rpcrdma_flush_cqs(ep);
c56c65fb
TT
1065 rc = rdma_disconnect(ia->ri_id);
1066 if (!rc) {
1067 /* returns without wait if not connected */
1068 wait_event_interruptible(ep->rep_connect_wait,
1069 ep->rep_connected != 1);
1070 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
1071 (ep->rep_connected == 1) ? "still " : "dis");
1072 } else {
1073 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1074 ep->rep_connected = rc;
1075 }
c56c65fb
TT
1076}
1077
1392402c
CL
1078static struct rpcrdma_req *
1079rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1080{
1081 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1082 size_t wlen = 1 << fls(cdata->inline_wsize +
1083 sizeof(struct rpcrdma_req));
1084 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1085 struct rpcrdma_req *req;
1086 int rc;
1087
1088 rc = -ENOMEM;
1089 req = kmalloc(wlen, GFP_KERNEL);
1090 if (req == NULL)
1091 goto out;
1092 memset(req, 0, sizeof(struct rpcrdma_req));
1093
1094 rc = rpcrdma_register_internal(ia, req->rl_base, wlen -
1095 offsetof(struct rpcrdma_req, rl_base),
1096 &req->rl_handle, &req->rl_iov);
1097 if (rc)
1098 goto out_free;
1099
1100 req->rl_size = wlen - sizeof(struct rpcrdma_req);
1101 req->rl_buffer = &r_xprt->rx_buf;
1102 return req;
1103
1104out_free:
1105 kfree(req);
1106out:
1107 return ERR_PTR(rc);
1108}
1109
1110static struct rpcrdma_rep *
1111rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1112{
1113 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1114 size_t rlen = 1 << fls(cdata->inline_rsize +
1115 sizeof(struct rpcrdma_rep));
1116 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1117 struct rpcrdma_rep *rep;
1118 int rc;
1119
1120 rc = -ENOMEM;
1121 rep = kmalloc(rlen, GFP_KERNEL);
1122 if (rep == NULL)
1123 goto out;
1124 memset(rep, 0, sizeof(struct rpcrdma_rep));
1125
1126 rc = rpcrdma_register_internal(ia, rep->rr_base, rlen -
1127 offsetof(struct rpcrdma_rep, rr_base),
1128 &rep->rr_handle, &rep->rr_iov);
1129 if (rc)
1130 goto out_free;
1131
1132 rep->rr_buffer = &r_xprt->rx_buf;
1133 return rep;
1134
1135out_free:
1136 kfree(rep);
1137out:
1138 return ERR_PTR(rc);
1139}
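/* Sizing note for the two constructors above: "1 << fls(len)" rounds
 * each allocation up to the next power of two so the inline send or
 * receive buffer rides in the same kmalloc region as the struct.
 * Illustrative arithmetic (sizes assumed): an inline_rsize of 1024
 * plus a struct of a few hundred bytes is about 1.3KB, which becomes
 * a 2KB allocation.
 */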
1140
2e84522c
CL
1141static int
1142rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1143{
1144 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1145 struct ib_fmr_attr fmr_attr = {
1146 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1147 .max_maps = 1,
1148 .page_shift = PAGE_SHIFT
1149 };
1150 struct rpcrdma_mw *r;
1151 int i, rc;
1152
1153 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1154 dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
1155
1156 while (i--) {
1157 r = kzalloc(sizeof(*r), GFP_KERNEL);
1158 if (r == NULL)
1159 return -ENOMEM;
1160
1161 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1162 if (IS_ERR(r->r.fmr)) {
1163 rc = PTR_ERR(r->r.fmr);
1164 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1165 __func__, rc);
1166 goto out_free;
1167 }
1168
1169 list_add(&r->mw_list, &buf->rb_mws);
1170 list_add(&r->mw_all, &buf->rb_all);
1171 }
1172 return 0;
1173
1174out_free:
1175 kfree(r);
1176 return rc;
1177}
1178
1179static int
1180rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1181{
1182 struct rpcrdma_frmr *f;
1183 struct rpcrdma_mw *r;
1184 int i, rc;
1185
1186 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1187 dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
1188
1189 while (i--) {
1190 r = kzalloc(sizeof(*r), GFP_KERNEL);
1191 if (r == NULL)
1192 return -ENOMEM;
1193 f = &r->r.frmr;
1194
1195 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1196 ia->ri_max_frmr_depth);
1197 if (IS_ERR(f->fr_mr)) {
1198 rc = PTR_ERR(f->fr_mr);
1199 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1200 "failed %i\n", __func__, rc);
1201 goto out_free;
1202 }
1203
1204 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1205 ia->ri_max_frmr_depth);
1206 if (IS_ERR(f->fr_pgl)) {
1207 rc = PTR_ERR(f->fr_pgl);
1208 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1209 "failed %i\n", __func__, rc);
1210
1211 ib_dereg_mr(f->fr_mr);
1212 goto out_free;
1213 }
1214
1215 list_add(&r->mw_list, &buf->rb_mws);
1216 list_add(&r->mw_all, &buf->rb_all);
1217 }
1218
1219 return 0;
1220
1221out_free:
1222 kfree(r);
1223 return rc;
1224}
1225
c56c65fb 1226int
ac920d04 1227rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
c56c65fb 1228{
ac920d04
CL
1229 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1230 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1231 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
c56c65fb 1232 char *p;
1392402c 1233 size_t len;
c56c65fb
TT
1234 int i, rc;
1235
1236 buf->rb_max_requests = cdata->max_requests;
1237 spin_lock_init(&buf->rb_lock);
c56c65fb
TT
1238
1239 /* Need to allocate:
1240 * 1. arrays for send and recv pointers
1241 * 2. arrays of struct rpcrdma_req to fill in pointers
1242 * 3. array of struct rpcrdma_rep for replies
1243 * 4. padding, if any
c56c65fb
TT
1244 * Send/recv buffers in req/rep need to be registered
1245 */
c56c65fb
TT
1246 len = buf->rb_max_requests *
1247 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1248 len += cdata->padding;
c56c65fb 1249
c56c65fb
TT
1250 p = kzalloc(len, GFP_KERNEL);
1251 if (p == NULL) {
1252 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1253 __func__, len);
1254 rc = -ENOMEM;
1255 goto out;
1256 }
1257 buf->rb_pool = p; /* for freeing it later */
1258
1259 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1260 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1261 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1262 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1263
1264 /*
1265 * Register the zeroed pad buffer, if any.
1266 */
1267 if (cdata->padding) {
ac920d04 1268 struct rpcrdma_ep *ep = &r_xprt->rx_ep;
c56c65fb
TT
1269 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1270 &ep->rep_pad_mr, &ep->rep_pad);
1271 if (rc)
1272 goto out;
1273 }
1274 p += cdata->padding;
1275
c56c65fb 1276 INIT_LIST_HEAD(&buf->rb_mws);
3111d72c 1277 INIT_LIST_HEAD(&buf->rb_all);
c56c65fb 1278 switch (ia->ri_memreg_strategy) {
3197d309 1279 case RPCRDMA_FRMR:
2e84522c
CL
1280 rc = rpcrdma_init_frmrs(ia, buf);
1281 if (rc)
1282 goto out;
3197d309 1283 break;
c56c65fb 1284 case RPCRDMA_MTHCAFMR:
2e84522c
CL
1285 rc = rpcrdma_init_fmrs(ia, buf);
1286 if (rc)
1287 goto out;
c56c65fb 1288 break;
c56c65fb
TT
1289 default:
1290 break;
1291 }
1292
c56c65fb
TT
1293 for (i = 0; i < buf->rb_max_requests; i++) {
1294 struct rpcrdma_req *req;
1295 struct rpcrdma_rep *rep;
1296
1392402c
CL
1297 req = rpcrdma_create_req(r_xprt);
1298 if (IS_ERR(req)) {
c56c65fb
TT
1299 dprintk("RPC: %s: request buffer %d alloc"
1300 " failed\n", __func__, i);
1392402c 1301 rc = PTR_ERR(req);
c56c65fb
TT
1302 goto out;
1303 }
c56c65fb 1304 buf->rb_send_bufs[i] = req;
c56c65fb 1305
1392402c
CL
1306 rep = rpcrdma_create_rep(r_xprt);
1307 if (IS_ERR(rep)) {
c56c65fb
TT
1308 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1309 __func__, i);
1392402c 1310 rc = PTR_ERR(rep);
c56c65fb
TT
1311 goto out;
1312 }
c56c65fb 1313 buf->rb_recv_bufs[i] = rep;
c56c65fb 1314 }
1392402c 1315
c56c65fb
TT
1316 return 0;
1317out:
1318 rpcrdma_buffer_destroy(buf);
1319 return rc;
1320}
1321
1392402c
CL
1322static void
1323rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
1324{
1325 if (!rep)
1326 return;
1327
1328 rpcrdma_deregister_internal(ia, rep->rr_handle, &rep->rr_iov);
1329 kfree(rep);
1330}
1331
1332static void
1333rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1334{
1335 if (!req)
1336 return;
1337
1338 rpcrdma_deregister_internal(ia, req->rl_handle, &req->rl_iov);
1339 kfree(req);
1340}
1341
2e84522c
CL
1342static void
1343rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1344{
1345 struct rpcrdma_mw *r;
1346 int rc;
1347
1348 while (!list_empty(&buf->rb_all)) {
1349 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1350 list_del(&r->mw_all);
1351 list_del(&r->mw_list);
1352
1353 rc = ib_dealloc_fmr(r->r.fmr);
1354 if (rc)
1355 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1356 __func__, rc);
1357
1358 kfree(r);
1359 }
1360}
1361
1362static void
1363rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1364{
1365 struct rpcrdma_mw *r;
1366 int rc;
1367
1368 while (!list_empty(&buf->rb_all)) {
1369 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1370 list_del(&r->mw_all);
1371 list_del(&r->mw_list);
1372
1373 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1374 if (rc)
1375 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1376 __func__, rc);
1377 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1378
1379 kfree(r);
1380 }
1381}
1382
c56c65fb
TT
1383void
1384rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1385{
c56c65fb 1386 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
2e84522c 1387 int i;
c56c65fb
TT
1388
1389 /* clean up in reverse order from create
1390 * 1. recv mr memory (mr free, then kfree)
c56c65fb 1391 * 2. send mr memory (mr free, then kfree)
2e84522c 1392 * 3. MWs
c56c65fb
TT
1393 */
1394 dprintk("RPC: %s: entering\n", __func__);
1395
1396 for (i = 0; i < buf->rb_max_requests; i++) {
1392402c
CL
1397 if (buf->rb_recv_bufs)
1398 rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
1399 if (buf->rb_send_bufs)
1400 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
c56c65fb
TT
1401 }
1402
2e84522c
CL
1403 switch (ia->ri_memreg_strategy) {
1404 case RPCRDMA_FRMR:
1405 rpcrdma_destroy_frmrs(buf);
1406 break;
1407 case RPCRDMA_MTHCAFMR:
1408 rpcrdma_destroy_fmrs(buf);
1409 break;
1410 default:
1411 break;
4034ba04
AA
1412 }
1413
c56c65fb
TT
1414 kfree(buf->rb_pool);
1415}
1416
467c9674
CL
1417/* After a disconnect, unmap all FMRs.
1418 *
1419 * This is invoked only in the transport connect worker in order
1420 * to serialize with rpcrdma_register_fmr_external().
1421 */
1422static void
1423rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1424{
1425 struct rpcrdma_xprt *r_xprt =
1426 container_of(ia, struct rpcrdma_xprt, rx_ia);
1427 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1428 struct list_head *pos;
1429 struct rpcrdma_mw *r;
1430 LIST_HEAD(l);
1431 int rc;
1432
1433 list_for_each(pos, &buf->rb_all) {
1434 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1435
1436 INIT_LIST_HEAD(&l);
1437 list_add(&r->r.fmr->list, &l);
1438 rc = ib_unmap_fmr(&l);
1439 if (rc)
1440 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
1441 __func__, rc);
1442 }
1443}
1444
9f9d802a
CL
1445/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1446 * an unusable state. Find FRMRs in this state and dereg / reg
1447 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1448 * also torn down.
1449 *
1450 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1451 *
1452 * This is invoked only in the transport connect worker in order
1453 * to serialize with rpcrdma_register_frmr_external().
1454 */
1455static void
1456rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1457{
1458 struct rpcrdma_xprt *r_xprt =
1459 container_of(ia, struct rpcrdma_xprt, rx_ia);
1460 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1461 struct list_head *pos;
1462 struct rpcrdma_mw *r;
1463 int rc;
1464
1465 list_for_each(pos, &buf->rb_all) {
1466 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1467
1468 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1469 continue;
1470
1471 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1472 if (rc)
1473 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1474 __func__, rc);
1475 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1476
1477 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1478 ia->ri_max_frmr_depth);
1479 if (IS_ERR(r->r.frmr.fr_mr)) {
1480 rc = PTR_ERR(r->r.frmr.fr_mr);
1481 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1482 " failed %i\n", __func__, rc);
1483 continue;
1484 }
1485 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1486 ia->ri_id->device,
1487 ia->ri_max_frmr_depth);
1488 if (IS_ERR(r->r.frmr.fr_pgl)) {
1489 rc = PTR_ERR(r->r.frmr.fr_pgl);
1490 dprintk("RPC: %s: "
1491 "ib_alloc_fast_reg_page_list "
1492 "failed %i\n", __func__, rc);
1493
1494 ib_dereg_mr(r->r.frmr.fr_mr);
1495 continue;
1496 }
1497 r->r.frmr.fr_state = FRMR_IS_INVALID;
1498 }
1499}
1500
c2922c02
CL
1501/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1502 * some req segments uninitialized.
1503 */
1504static void
1505rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1506{
1507 if (*mw) {
1508 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1509 *mw = NULL;
1510 }
1511}
1512
1513/* Cycle mw's back in reverse order, and "spin" them.
1514 * This delays and scrambles reuse as much as possible.
1515 */
1516static void
1517rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1518{
1519 struct rpcrdma_mr_seg *seg = req->rl_segments;
1520 struct rpcrdma_mr_seg *seg1 = seg;
1521 int i;
1522
1523 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
3eb35810
CL
1524 rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
1525 rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
c2922c02
CL
1526}
1527
1528static void
1529rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1530{
1531 buf->rb_send_bufs[--buf->rb_send_index] = req;
1532 req->rl_niovs = 0;
1533 if (req->rl_reply) {
1534 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1535 req->rl_reply->rr_func = NULL;
1536 req->rl_reply = NULL;
1537 }
1538}
1539
ddb6bebc
CL
1540/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1541 * Redo only the ib_post_send().
1542 */
1543static void
1544rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1545{
1546 struct rpcrdma_xprt *r_xprt =
1547 container_of(ia, struct rpcrdma_xprt, rx_ia);
1548 struct ib_send_wr invalidate_wr, *bad_wr;
1549 int rc;
1550
1551 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1552
1553 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
dab7e3b8 1554 r->r.frmr.fr_state = FRMR_IS_INVALID;
ddb6bebc
CL
1555
1556 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1557 invalidate_wr.wr_id = (unsigned long)(void *)r;
1558 invalidate_wr.opcode = IB_WR_LOCAL_INV;
ddb6bebc
CL
1559 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1560 DECR_CQCOUNT(&r_xprt->rx_ep);
1561
1562 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1563 __func__, r, r->r.frmr.fr_mr->rkey);
1564
1565 read_lock(&ia->ri_qplock);
1566 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1567 read_unlock(&ia->ri_qplock);
1568 if (rc) {
1569 /* Force rpcrdma_buffer_get() to retry */
1570 r->r.frmr.fr_state = FRMR_IS_STALE;
1571 dprintk("RPC: %s: ib_post_send failed, %i\n",
1572 __func__, rc);
1573 }
1574}
1575
1576static void
1577rpcrdma_retry_flushed_linv(struct list_head *stale,
1578 struct rpcrdma_buffer *buf)
1579{
1580 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1581 struct list_head *pos;
1582 struct rpcrdma_mw *r;
1583 unsigned long flags;
1584
1585 list_for_each(pos, stale) {
1586 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1587 rpcrdma_retry_local_inv(r, ia);
1588 }
1589
1590 spin_lock_irqsave(&buf->rb_lock, flags);
1591 list_splice_tail(stale, &buf->rb_mws);
1592 spin_unlock_irqrestore(&buf->rb_lock, flags);
1593}
1594
1595static struct rpcrdma_req *
1596rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1597 struct list_head *stale)
1598{
1599 struct rpcrdma_mw *r;
1600 int i;
1601
1602 i = RPCRDMA_MAX_SEGS - 1;
1603 while (!list_empty(&buf->rb_mws)) {
1604 r = list_entry(buf->rb_mws.next,
1605 struct rpcrdma_mw, mw_list);
1606 list_del(&r->mw_list);
1607 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1608 list_add(&r->mw_list, stale);
1609 continue;
1610 }
3eb35810 1611 req->rl_segments[i].rl_mw = r;
ddb6bebc
CL
1612 if (unlikely(i-- == 0))
1613 return req; /* Success */
1614 }
1615
1616 /* Not enough entries on rb_mws for this req */
1617 rpcrdma_buffer_put_sendbuf(req, buf);
1618 rpcrdma_buffer_put_mrs(req, buf);
1619 return NULL;
1620}
1621
c2922c02 1622static struct rpcrdma_req *
ddb6bebc 1623rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
c2922c02
CL
1624{
1625 struct rpcrdma_mw *r;
1626 int i;
1627
1628 i = RPCRDMA_MAX_SEGS - 1;
1629 while (!list_empty(&buf->rb_mws)) {
1630 r = list_entry(buf->rb_mws.next,
1631 struct rpcrdma_mw, mw_list);
1632 list_del(&r->mw_list);
3eb35810 1633 req->rl_segments[i].rl_mw = r;
c2922c02
CL
1634 if (unlikely(i-- == 0))
1635 return req; /* Success */
1636 }
1637
1638 /* Not enough entries on rb_mws for this req */
1639 rpcrdma_buffer_put_sendbuf(req, buf);
1640 rpcrdma_buffer_put_mrs(req, buf);
1641 return NULL;
1642}
1643
c56c65fb
TT
1644/*
1645 * Get a set of request/reply buffers.
1646 *
1647 * Reply buffer (if needed) is attached to send buffer upon return.
1648 * Rule:
1649 * rb_send_index and rb_recv_index MUST always be pointing to the
1650 * *next* available buffer (non-NULL). They are incremented after
1651 * removing buffers, and decremented *before* returning them.
1652 */
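/* Illustration of the rule above: rpcrdma_buffer_get() hands out
 * rb_send_bufs[rb_send_index] and then post-increments the index,
 * NULLing the slot; rpcrdma_buffer_put() pre-decrements the index and
 * refills the slot. The recv index moves the same way, so both always
 * name the next non-NULL entry while rb_lock is held.
 */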
1653struct rpcrdma_req *
1654rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1655{
c2922c02 1656 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
ddb6bebc 1657 struct list_head stale;
c56c65fb
TT
1658 struct rpcrdma_req *req;
1659 unsigned long flags;
1660
1661 spin_lock_irqsave(&buffers->rb_lock, flags);
1662 if (buffers->rb_send_index == buffers->rb_max_requests) {
1663 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1664 dprintk("RPC: %s: out of request buffers\n", __func__);
1665 return ((struct rpcrdma_req *)NULL);
1666 }
1667
1668 req = buffers->rb_send_bufs[buffers->rb_send_index];
1669 if (buffers->rb_send_index < buffers->rb_recv_index) {
1670 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1671 __func__,
1672 buffers->rb_recv_index - buffers->rb_send_index);
1673 req->rl_reply = NULL;
1674 } else {
1675 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1676 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1677 }
1678 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
ddb6bebc
CL
1679
1680 INIT_LIST_HEAD(&stale);
c2922c02
CL
1681 switch (ia->ri_memreg_strategy) {
1682 case RPCRDMA_FRMR:
ddb6bebc
CL
1683 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1684 break;
c2922c02 1685 case RPCRDMA_MTHCAFMR:
ddb6bebc 1686 req = rpcrdma_buffer_get_fmrs(req, buffers);
c2922c02
CL
1687 break;
1688 default:
1689 break;
c56c65fb
TT
1690 }
1691 spin_unlock_irqrestore(&buffers->rb_lock, flags);
ddb6bebc
CL
1692 if (!list_empty(&stale))
1693 rpcrdma_retry_flushed_linv(&stale, buffers);
c56c65fb
TT
1694 return req;
1695}
1696
1697/*
1698 * Put request/reply buffers back into pool.
1699 * Pre-decrement counter/array index.
1700 */
1701void
1702rpcrdma_buffer_put(struct rpcrdma_req *req)
1703{
1704 struct rpcrdma_buffer *buffers = req->rl_buffer;
1705 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
c56c65fb
TT
1706 unsigned long flags;
1707
c56c65fb 1708 spin_lock_irqsave(&buffers->rb_lock, flags);
c2922c02 1709 rpcrdma_buffer_put_sendbuf(req, buffers);
c56c65fb 1710 switch (ia->ri_memreg_strategy) {
3197d309 1711 case RPCRDMA_FRMR:
c56c65fb 1712 case RPCRDMA_MTHCAFMR:
c2922c02 1713 rpcrdma_buffer_put_mrs(req, buffers);
c56c65fb
TT
1714 break;
1715 default:
1716 break;
1717 }
1718 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1719}
1720
1721/*
1722 * Recover reply buffers from pool.
1723 * This happens when recovering from error conditions.
1724 * Post-increment counter/array index.
1725 */
1726void
1727rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1728{
1729 struct rpcrdma_buffer *buffers = req->rl_buffer;
1730 unsigned long flags;
1731
1732 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1733 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1734 spin_lock_irqsave(&buffers->rb_lock, flags);
1735 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1736 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1737 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1738 }
1739 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1740}
1741
1742/*
1743 * Put reply buffers back into pool when not attached to
b45ccfd2 1744 * request. This happens in error conditions.
c56c65fb
TT
1745 */
1746void
1747rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1748{
1749 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1750 unsigned long flags;
1751
1752 rep->rr_func = NULL;
1753 spin_lock_irqsave(&buffers->rb_lock, flags);
1754 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1755 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1756}
1757
1758/*
1759 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1760 */
1761
1762int
1763rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1764 struct ib_mr **mrp, struct ib_sge *iov)
1765{
1766 struct ib_phys_buf ipb;
1767 struct ib_mr *mr;
1768 int rc;
1769
1770 /*
1771 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1772 */
1773 iov->addr = ib_dma_map_single(ia->ri_id->device,
1774 va, len, DMA_BIDIRECTIONAL);
bf858ab0
YB
1775 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1776 return -ENOMEM;
1777
c56c65fb
TT
1778 iov->length = len;
1779
bd7ed1d1
TT
1780 if (ia->ri_have_dma_lkey) {
1781 *mrp = NULL;
1782 iov->lkey = ia->ri_dma_lkey;
1783 return 0;
1784 } else if (ia->ri_bind_mem != NULL) {
c56c65fb
TT
1785 *mrp = NULL;
1786 iov->lkey = ia->ri_bind_mem->lkey;
1787 return 0;
1788 }
1789
1790 ipb.addr = iov->addr;
1791 ipb.size = iov->length;
1792 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1793 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1794
1795 dprintk("RPC: %s: phys convert: 0x%llx "
1796 "registered 0x%llx length %d\n",
a56daeb7
AM
1797 __func__, (unsigned long long)ipb.addr,
1798 (unsigned long long)iov->addr, len);
c56c65fb
TT
1799
1800 if (IS_ERR(mr)) {
1801 *mrp = NULL;
1802 rc = PTR_ERR(mr);
1803 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1804 } else {
1805 *mrp = mr;
1806 iov->lkey = mr->lkey;
1807 rc = 0;
1808 }
1809
1810 return rc;
1811}
1812
1813int
1814rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1815 struct ib_mr *mr, struct ib_sge *iov)
1816{
1817 int rc;
1818
1819 ib_dma_unmap_single(ia->ri_id->device,
1820 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1821
1822 if (NULL == mr)
1823 return 0;
1824
1825 rc = ib_dereg_mr(mr);
1826 if (rc)
1827 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1828 return rc;
1829}
1830
1831/*
1832 * Wrappers for chunk registration, shared by read/write chunk code.
1833 */
1834
1835static void
1836rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1837{
1838 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1839 seg->mr_dmalen = seg->mr_len;
1840 if (seg->mr_page)
1841 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1842 seg->mr_page, offset_in_page(seg->mr_offset),
1843 seg->mr_dmalen, seg->mr_dir);
1844 else
1845 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1846 seg->mr_offset,
1847 seg->mr_dmalen, seg->mr_dir);
5c635e09
TT
1848 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1849 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1850 __func__,
986d4abb
RD
1851 (unsigned long long)seg->mr_dma,
1852 seg->mr_offset, seg->mr_dmalen);
5c635e09 1853 }
c56c65fb
TT
1854}
1855
1856static void
1857rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1858{
1859 if (seg->mr_page)
1860 ib_dma_unmap_page(ia->ri_id->device,
1861 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1862 else
1863 ib_dma_unmap_single(ia->ri_id->device,
1864 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1865}
1866
3197d309
TT
1867static int
1868rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1869 int *nsegs, int writing, struct rpcrdma_ia *ia,
1870 struct rpcrdma_xprt *r_xprt)
1871{
1872 struct rpcrdma_mr_seg *seg1 = seg;
3eb35810 1873 struct rpcrdma_mw *mw = seg1->rl_mw;
0dbb4108
CL
1874 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1875 struct ib_mr *mr = frmr->fr_mr;
f590e878 1876 struct ib_send_wr fastreg_wr, *bad_wr;
3197d309
TT
1877 u8 key;
1878 int len, pageoff;
1879 int i, rc;
9b78145c
TT
1880 int seg_len;
1881 u64 pa;
1882 int page_no;
3197d309
TT
1883
1884 pageoff = offset_in_page(seg1->mr_offset);
1885 seg1->mr_offset -= pageoff; /* start of page */
1886 seg1->mr_len += pageoff;
1887 len = -pageoff;
0fc6c4e7
SW
1888 if (*nsegs > ia->ri_max_frmr_depth)
1889 *nsegs = ia->ri_max_frmr_depth;
9b78145c 1890 for (page_no = i = 0; i < *nsegs;) {
3197d309 1891 rpcrdma_map_one(ia, seg, writing);
9b78145c
TT
1892 pa = seg->mr_dma;
1893 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
0dbb4108 1894 frmr->fr_pgl->page_list[page_no++] = pa;
9b78145c
TT
1895 pa += PAGE_SIZE;
1896 }
3197d309
TT
1897 len += seg->mr_len;
1898 ++seg;
1899 ++i;
1900 /* Check for holes */
1901 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1902 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1903 break;
1904 }
1905 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
0dbb4108 1906 __func__, mw, i);
3197d309 1907
05055722
CL
1908 frmr->fr_state = FRMR_IS_VALID;
1909
f590e878
CL
1910 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1911 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1912 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1913 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1914 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1915 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1916 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1917 fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1918 if (fastreg_wr.wr.fast_reg.length < len) {
5fc83f47
CL
1919 rc = -EIO;
1920 goto out_err;
c977dea2
CL
1921 }
1922
1923 /* Bump the key */
0dbb4108
CL
1924 key = (u8)(mr->rkey & 0x000000FF);
1925 ib_update_fast_reg_key(mr, ++key);
c977dea2 1926
f590e878 1927 fastreg_wr.wr.fast_reg.access_flags = (writing ?
68743082
VP
1928 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1929 IB_ACCESS_REMOTE_READ);
f590e878 1930 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
3197d309
TT
1931 DECR_CQCOUNT(&r_xprt->rx_ep);
1932
f590e878 1933 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
3197d309
TT
1934 if (rc) {
1935 dprintk("RPC: %s: failed ib_post_send for register,"
1936 " status %i\n", __func__, rc);
c93e986a 1937 ib_update_fast_reg_key(mr, --key);
5fc83f47 1938 goto out_err;
3197d309 1939 } else {
0dbb4108 1940 seg1->mr_rkey = mr->rkey;
3197d309
TT
1941 seg1->mr_base = seg1->mr_dma + pageoff;
1942 seg1->mr_nsegs = i;
1943 seg1->mr_len = len;
1944 }
1945 *nsegs = i;
5fc83f47
CL
1946 return 0;
1947out_err:
05055722 1948 frmr->fr_state = FRMR_IS_INVALID;
5fc83f47
CL
1949 while (i--)
1950 rpcrdma_unmap_one(ia, --seg);
3197d309
TT
1951 return rc;
1952}
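/*
 * Note (editorial): the fast-register path above proceeds in three
 * steps: DMA-map each segment and collect its page addresses in the
 * FRMR page list (ending the chunk at the first page-alignment "hole"),
 * bump the low byte of the MR's rkey so references to the previous
 * registration cannot match, then post an unsignaled IB_WR_FAST_REG_MR
 * (accounted via DECR_CQCOUNT) and report rkey/base/length back through
 * seg1 for the chunk lists.
 */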
1953
1954static int
1955rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1956 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1957{
1958 struct rpcrdma_mr_seg *seg1 = seg;
1959 struct ib_send_wr invalidate_wr, *bad_wr;
1960 int rc;
1961
3eb35810 1962 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
dab7e3b8 1963
3197d309 1964 memset(&invalidate_wr, 0, sizeof invalidate_wr);
3eb35810 1965 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
3197d309 1966 invalidate_wr.opcode = IB_WR_LOCAL_INV;
3eb35810 1967 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
3197d309
TT
1968 DECR_CQCOUNT(&r_xprt->rx_ep);
1969
73806c88
CL
1970 read_lock(&ia->ri_qplock);
1971 while (seg1->mr_nsegs--)
1972 rpcrdma_unmap_one(ia, seg++);
3197d309 1973 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
73806c88 1974 read_unlock(&ia->ri_qplock);
dab7e3b8
CL
1975 if (rc) {
1976 /* Force rpcrdma_buffer_get() to retry */
3eb35810 1977 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
3197d309
TT
1978 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1979 " status %i\n", __func__, rc);
dab7e3b8 1980 }
3197d309
TT
1981 return rc;
1982}
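/*
 * Note (editorial): invalidation posts an IB_WR_LOCAL_INV for the
 * chunk's rkey and unmaps the segments while holding ri_qplock for
 * read, presumably to serialize against QP replacement.  On a failed
 * post the MW is marked FRMR_IS_STALE, forcing rpcrdma_buffer_get() to
 * retry rather than reuse a registration in an unknown state.
 */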
1983
8d4ba034
TT
1984static int
1985rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1986 int *nsegs, int writing, struct rpcrdma_ia *ia)
1987{
1988 struct rpcrdma_mr_seg *seg1 = seg;
1989 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1990 int len, pageoff, i, rc;
1991
1992 pageoff = offset_in_page(seg1->mr_offset);
1993 seg1->mr_offset -= pageoff; /* start of page */
1994 seg1->mr_len += pageoff;
1995 len = -pageoff;
1996 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1997 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1998 for (i = 0; i < *nsegs;) {
1999 rpcrdma_map_one(ia, seg, writing);
2000 physaddrs[i] = seg->mr_dma;
2001 len += seg->mr_len;
2002 ++seg;
2003 ++i;
2004 /* Check for holes */
2005 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
2006 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
2007 break;
2008 }
3eb35810 2009 rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
8d4ba034
TT
2010 if (rc) {
2011 dprintk("RPC: %s: failed ib_map_phys_fmr "
2012 "%u@0x%llx+%i (%d)... status %i\n", __func__,
2013 len, (unsigned long long)seg1->mr_dma,
2014 pageoff, i, rc);
2015 while (i--)
2016 rpcrdma_unmap_one(ia, --seg);
2017 } else {
3eb35810 2018 seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
8d4ba034
TT
2019 seg1->mr_base = seg1->mr_dma + pageoff;
2020 seg1->mr_nsegs = i;
2021 seg1->mr_len = len;
2022 }
2023 *nsegs = i;
2024 return rc;
2025}
2026
2027static int
2028rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
2029 struct rpcrdma_ia *ia)
2030{
2031 struct rpcrdma_mr_seg *seg1 = seg;
2032 LIST_HEAD(l);
2033 int rc;
2034
3eb35810 2035 list_add(&seg1->rl_mw->r.fmr->list, &l);
8d4ba034 2036 rc = ib_unmap_fmr(&l);
73806c88 2037 read_lock(&ia->ri_qplock);
8d4ba034
TT
2038 while (seg1->mr_nsegs--)
2039 rpcrdma_unmap_one(ia, seg++);
73806c88 2040 read_unlock(&ia->ri_qplock);
8d4ba034
TT
2041 if (rc)
2042 dprintk("RPC: %s: failed ib_unmap_fmr,"
2043 " status %i\n", __func__, rc);
2044 return rc;
2045}
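/*
 * Note (editorial): unlike the FRMR path, FMR (de)registration above is
 * a synchronous verb (ib_map_phys_fmr/ib_unmap_fmr) rather than a
 * posted work request, so no completion-queue accounting is involved.
 * Both paths apply the same "hole" rule, ending a chunk at the first
 * segment that is not page-aligned against its neighbour.
 */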
2046
c56c65fb
TT
2047int
2048rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
2049 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
2050{
2051 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
c56c65fb
TT
2052 int rc = 0;
2053
2054 switch (ia->ri_memreg_strategy) {
2055
c56c65fb
TT
2056 case RPCRDMA_ALLPHYSICAL:
2057 rpcrdma_map_one(ia, seg, writing);
2058 seg->mr_rkey = ia->ri_bind_mem->rkey;
2059 seg->mr_base = seg->mr_dma;
2060 seg->mr_nsegs = 1;
2061 nsegs = 1;
2062 break;
c56c65fb 2063
3197d309
TT
2064 	/* Registration using frmr memory registration */
2065 case RPCRDMA_FRMR:
2066 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
2067 break;
2068
8d4ba034 2069 /* Registration using fmr memory registration */
c56c65fb 2070 case RPCRDMA_MTHCAFMR:
8d4ba034 2071 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
c56c65fb
TT
2072 break;
2073
c56c65fb 2074 default:
92b98361 2075 return -EIO;
c56c65fb
TT
2076 }
2077 if (rc)
92b98361 2078 return rc;
c56c65fb
TT
2079
2080 return nsegs;
2081}
2082
2083int
2084rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
13c9ff8f 2085 struct rpcrdma_xprt *r_xprt)
c56c65fb
TT
2086{
2087 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
c56c65fb
TT
2088 int nsegs = seg->mr_nsegs, rc;
2089
2090 switch (ia->ri_memreg_strategy) {
2091
c56c65fb 2092 case RPCRDMA_ALLPHYSICAL:
73806c88 2093 read_lock(&ia->ri_qplock);
c56c65fb 2094 rpcrdma_unmap_one(ia, seg);
73806c88 2095 read_unlock(&ia->ri_qplock);
c56c65fb 2096 break;
c56c65fb 2097
3197d309
TT
2098 case RPCRDMA_FRMR:
2099 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
2100 break;
2101
c56c65fb 2102 case RPCRDMA_MTHCAFMR:
8d4ba034 2103 rc = rpcrdma_deregister_fmr_external(seg, ia);
c56c65fb
TT
2104 break;
2105
c56c65fb 2106 default:
c56c65fb
TT
2107 break;
2108 }
c56c65fb
TT
2109 return nsegs;
2110}
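/*
 * Editor's sketch (not part of verbs.c): how a caller such as the
 * RPC/RDMA marshaling code might drive the two dispatchers above.  The
 * helper name and the comments about chunk advertisement are
 * illustrative assumptions.
 */
static int
rpcrdma_example_chunk_cycle(struct rpcrdma_xprt *r_xprt,
			    struct rpcrdma_mr_seg *seg,
			    int nsegs, int writing)
{
	int n;

	/* returns the number of segments covered, or a negative errno */
	n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
	if (n < 0)
		return n;

	/* ... advertise seg->mr_rkey, mr_base and mr_len to the peer,
	 * wait for the reply, then tear the registration down ... */
	rpcrdma_deregister_external(seg, r_xprt);
	return n;
}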
2111
2112/*
2113 * Prepost any receive buffer, then post send.
2114 *
2115 * Receive buffer is donated to hardware, reclaimed upon recv completion.
2116 */
2117int
2118rpcrdma_ep_post(struct rpcrdma_ia *ia,
2119 struct rpcrdma_ep *ep,
2120 struct rpcrdma_req *req)
2121{
2122 struct ib_send_wr send_wr, *send_wr_fail;
2123 struct rpcrdma_rep *rep = req->rl_reply;
2124 int rc;
2125
2126 if (rep) {
2127 rc = rpcrdma_ep_post_recv(ia, ep, rep);
2128 if (rc)
2129 goto out;
2130 req->rl_reply = NULL;
2131 }
2132
2133 send_wr.next = NULL;
2134 send_wr.wr_id = 0ULL; /* no send cookie */
2135 send_wr.sg_list = req->rl_send_iov;
2136 send_wr.num_sge = req->rl_niovs;
2137 send_wr.opcode = IB_WR_SEND;
c56c65fb
TT
2138 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
2139 ib_dma_sync_single_for_device(ia->ri_id->device,
2140 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
2141 DMA_TO_DEVICE);
2142 ib_dma_sync_single_for_device(ia->ri_id->device,
2143 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
2144 DMA_TO_DEVICE);
2145 ib_dma_sync_single_for_device(ia->ri_id->device,
2146 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
2147 DMA_TO_DEVICE);
2148
2149 if (DECR_CQCOUNT(ep) > 0)
2150 send_wr.send_flags = 0;
2151 else { /* Provider must take a send completion every now and then */
2152 INIT_CQCOUNT(ep);
2153 send_wr.send_flags = IB_SEND_SIGNALED;
2154 }
2155
2156 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2157 if (rc)
2158 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
2159 rc);
2160out:
2161 return rc;
2162}
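/*
 * Note (editorial): sends are normally posted unsignaled.  DECR_CQCOUNT
 * counts them down, and once the budget is exhausted INIT_CQCOUNT
 * resets it and this one send is posted IB_SEND_SIGNALED so the
 * provider can retire the accumulated send work requests.
 */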
2163
2164/*
2165 * (Re)post a receive buffer.
2166 */
2167int
2168rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2169 struct rpcrdma_ep *ep,
2170 struct rpcrdma_rep *rep)
2171{
2172 struct ib_recv_wr recv_wr, *recv_wr_fail;
2173 int rc;
2174
2175 recv_wr.next = NULL;
2176 recv_wr.wr_id = (u64) (unsigned long) rep;
2177 recv_wr.sg_list = &rep->rr_iov;
2178 recv_wr.num_sge = 1;
2179
2180 ib_dma_sync_single_for_cpu(ia->ri_id->device,
2181 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
2182
c56c65fb
TT
2183 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2184
2185 if (rc)
2186 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
2187 rc);
2188 return rc;
2189}
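/*
 * Editor's sketch (not part of verbs.c): posting an RPC call from the
 * transport's view.  rpcrdma_ep_post() preposts req->rl_reply (when
 * present) before posting the send; the helper name is an illustrative
 * assumption.
 */
static int
rpcrdma_example_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
	return rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req);
}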
43e95988
CL
2190
2191 /* Physical mapping means one Read/Write list entry per page.
2192  * All list entries must fit within an inline buffer.
2193 *
2194 * NB: The server must return a Write list for NFS READ,
2195 * which has the same constraint. Factor in the inline
2196 * rsize as well.
2197 */
2198static size_t
2199rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2200{
2201 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2202 unsigned int inline_size, pages;
2203
2204 inline_size = min_t(unsigned int,
2205 cdata->inline_wsize, cdata->inline_rsize);
2206 inline_size -= RPCRDMA_HDRLEN_MIN;
2207 pages = inline_size / sizeof(struct rpcrdma_segment);
2208 return pages << PAGE_SHIFT;
2209}
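/*
 * Editor's worked example (constants assumed, not taken from this
 * file): with 1024-byte inline buffers, a 28-byte RPCRDMA_HDRLEN_MIN
 * and a 16-byte struct rpcrdma_segment, (1024 - 28) / 16 = 62 list
 * entries, so the physical strategy could move 62 pages, or 253952
 * bytes with 4KB pages.
 */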
2210
2211static size_t
2212rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2213{
2214 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2215}
2216
2217size_t
2218rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2219{
2220 size_t result;
2221
2222 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2223 case RPCRDMA_ALLPHYSICAL:
2224 result = rpcrdma_physical_max_payload(r_xprt);
2225 break;
2226 default:
2227 result = rpcrdma_mr_max_payload(r_xprt);
2228 }
2229 return result;
2230}