1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
21 *
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * verbs.c
42 *
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
48 */
49
50 #include <linux/interrupt.h>
51 #include <linux/slab.h>
52 #include <linux/prefetch.h>
53 #include <asm/bitops.h>
54
55 #include "xprt_rdma.h"
56
57 /*
58 * Globals/Macros
59 */
60
61 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
62 # define RPCDBG_FACILITY RPCDBG_TRANS
63 #endif
64
65 static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
66 static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
67
68 /*
69 * internal functions
70 */
71
72 /*
73  * Handle replies in tasklet context, using a single, global list.
74  * The rdma tasklet function simply walks the list and calls the reply
75  * callback (or recycles the buffer) for each reply queued on it.
76  */
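/*
 * Sketch of the dispatch path implemented below (descriptive only):
 * completion handlers queue rpcrdma_rep structures on rpcrdma_tasklets_g
 * under rpcrdma_tk_lock_g, and rpcrdma_run_tasklet() drains that list,
 * calling each reply's rr_func callback if one is set, or returning the
 * buffer via rpcrdma_recv_buffer_put() otherwise.
 */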
77
78 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
79 static LIST_HEAD(rpcrdma_tasklets_g);
80
81 static void
82 rpcrdma_run_tasklet(unsigned long data)
83 {
84 struct rpcrdma_rep *rep;
85 void (*func)(struct rpcrdma_rep *);
86 unsigned long flags;
87
88 	data = data;	/* the tasklet data argument is unused */
89 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
90 while (!list_empty(&rpcrdma_tasklets_g)) {
91 rep = list_entry(rpcrdma_tasklets_g.next,
92 struct rpcrdma_rep, rr_list);
93 list_del(&rep->rr_list);
94 func = rep->rr_func;
95 rep->rr_func = NULL;
96 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
97
98 if (func)
99 func(rep);
100 else
101 rpcrdma_recv_buffer_put(rep);
102
103 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
104 }
105 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
106 }
107
108 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
109
110 static const char * const async_event[] = {
111 "CQ error",
112 "QP fatal error",
113 "QP request error",
114 "QP access error",
115 "communication established",
116 "send queue drained",
117 "path migration successful",
118 "path mig error",
119 "device fatal error",
120 "port active",
121 "port error",
122 "LID change",
123 "P_key change",
124 "SM change",
125 "SRQ error",
126 "SRQ limit reached",
127 "last WQE reached",
128 "client reregister",
129 "GID change",
130 };
131
132 #define ASYNC_MSG(status) \
133 ((status) < ARRAY_SIZE(async_event) ? \
134 async_event[(status)] : "unknown async error")
135
136 static void
137 rpcrdma_schedule_tasklet(struct list_head *sched_list)
138 {
139 unsigned long flags;
140
141 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
142 list_splice_tail(sched_list, &rpcrdma_tasklets_g);
143 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
144 tasklet_schedule(&rpcrdma_tasklet_g);
145 }
146
147 static void
148 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
149 {
150 struct rpcrdma_ep *ep = context;
151
152 pr_err("RPC: %s: %s on device %s ep %p\n",
153 __func__, ASYNC_MSG(event->event),
154 event->device->name, context);
155 if (ep->rep_connected == 1) {
156 ep->rep_connected = -EIO;
157 rpcrdma_conn_func(ep);
158 wake_up_all(&ep->rep_connect_wait);
159 }
160 }
161
162 static void
163 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
164 {
165 struct rpcrdma_ep *ep = context;
166
167 pr_err("RPC: %s: %s on device %s ep %p\n",
168 __func__, ASYNC_MSG(event->event),
169 event->device->name, context);
170 if (ep->rep_connected == 1) {
171 ep->rep_connected = -EIO;
172 rpcrdma_conn_func(ep);
173 wake_up_all(&ep->rep_connect_wait);
174 }
175 }
176
177 static const char * const wc_status[] = {
178 "success",
179 "local length error",
180 "local QP operation error",
181 "local EE context operation error",
182 "local protection error",
183 "WR flushed",
184 "memory management operation error",
185 "bad response error",
186 "local access error",
187 "remote invalid request error",
188 "remote access error",
189 "remote operation error",
190 "transport retry counter exceeded",
191 "RNR retrycounter exceeded",
192 "local RDD violation error",
193 "remove invalid RD request",
194 "operation aborted",
195 "invalid EE context number",
196 "invalid EE context state",
197 "fatal error",
198 "response timeout error",
199 "general error",
200 };
201
202 #define COMPLETION_MSG(status) \
203 ((status) < ARRAY_SIZE(wc_status) ? \
204 wc_status[(status)] : "unexpected completion error")
205
206 static void
207 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
208 {
209 if (likely(wc->status == IB_WC_SUCCESS))
210 return;
211
212 /* WARNING: Only wr_id and status are reliable at this point */
213 if (wc->wr_id == 0ULL) {
214 if (wc->status != IB_WC_WR_FLUSH_ERR)
215 pr_err("RPC: %s: SEND: %s\n",
216 __func__, COMPLETION_MSG(wc->status));
217 } else {
218 struct rpcrdma_mw *r;
219
220 r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
221 r->r.frmr.fr_state = FRMR_IS_STALE;
222 pr_err("RPC: %s: frmr %p (stale): %s\n",
223 __func__, r, COMPLETION_MSG(wc->status));
224 }
225 }
226
227 static int
228 rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
229 {
230 struct ib_wc *wcs;
231 int budget, count, rc;
232
233 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
234 do {
235 wcs = ep->rep_send_wcs;
236
237 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
238 if (rc <= 0)
239 return rc;
240
241 count = rc;
242 while (count-- > 0)
243 rpcrdma_sendcq_process_wc(wcs++);
244 } while (rc == RPCRDMA_POLLSIZE && --budget);
245 return 0;
246 }
247
248 /*
249 * Handle send, fast_reg_mr, and local_inv completions.
250 *
251 * Send events are typically suppressed and thus do not result
252 * in an upcall. Occasionally one is signaled, however. This
253 * prevents the provider's completion queue from wrapping and
254 * losing a completion.
255 */
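/*
 * For illustration: rpcrdma_ep_post() below calls DECR_CQCOUNT() for each
 * SEND and sets IB_SEND_SIGNALED only when the countdown runs out, then
 * re-arms it with INIT_CQCOUNT(); so roughly one in every ep->rep_cqinit
 * sends generates a completion that actually reaches this handler.
 */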
256 static void
257 rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
258 {
259 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
260 int rc;
261
262 rc = rpcrdma_sendcq_poll(cq, ep);
263 if (rc) {
264 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
265 __func__, rc);
266 return;
267 }
268
269 rc = ib_req_notify_cq(cq,
270 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
271 if (rc == 0)
272 return;
273 if (rc < 0) {
274 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
275 __func__, rc);
276 return;
277 }
278
279 rpcrdma_sendcq_poll(cq, ep);
280 }
281
282 static void
283 rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
284 {
285 struct rpcrdma_rep *rep =
286 (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
287
288 /* WARNING: Only wr_id and status are reliable at this point */
289 if (wc->status != IB_WC_SUCCESS)
290 goto out_fail;
291
292 /* status == SUCCESS means all fields in wc are trustworthy */
293 if (wc->opcode != IB_WC_RECV)
294 return;
295
296 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
297 __func__, rep, wc->byte_len);
298
299 rep->rr_len = wc->byte_len;
300 ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
301 rdmab_addr(rep->rr_rdmabuf),
302 rep->rr_len, DMA_FROM_DEVICE);
303 prefetch(rdmab_to_msg(rep->rr_rdmabuf));
304
305 out_schedule:
306 list_add_tail(&rep->rr_list, sched_list);
307 return;
308 out_fail:
309 if (wc->status != IB_WC_WR_FLUSH_ERR)
310 pr_err("RPC: %s: rep %p: %s\n",
311 __func__, rep, COMPLETION_MSG(wc->status));
312 rep->rr_len = ~0U;
313 goto out_schedule;
314 }
315
316 static int
317 rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
318 {
319 struct list_head sched_list;
320 struct ib_wc *wcs;
321 int budget, count, rc;
322
323 INIT_LIST_HEAD(&sched_list);
324 budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
325 do {
326 wcs = ep->rep_recv_wcs;
327
328 rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
329 if (rc <= 0)
330 goto out_schedule;
331
332 count = rc;
333 while (count-- > 0)
334 rpcrdma_recvcq_process_wc(wcs++, &sched_list);
335 } while (rc == RPCRDMA_POLLSIZE && --budget);
336 rc = 0;
337
338 out_schedule:
339 rpcrdma_schedule_tasklet(&sched_list);
340 return rc;
341 }
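/*
 * Note: the poller above gathers up to RPCRDMA_POLLSIZE completions per
 * ib_poll_cq() call, within an overall budget of RPCRDMA_WC_BUDGET, and
 * hands the accumulated sched_list to the tasklet in a single
 * rpcrdma_schedule_tasklet() call.
 */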
342
343 /*
344 * Handle receive completions.
345 *
346  * This handler is reentrant, but completions are processed one at a time
347  * to preserve the ordering of receives and keep server credits accurate.
348 *
349 * It is the responsibility of the scheduled tasklet to return
350 * recv buffers to the pool. NOTE: this affects synchronization of
351 * connection shutdown. That is, the structures required for
352 * the completion of the reply handler must remain intact until
353 * all memory has been reclaimed.
354 */
355 static void
356 rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
357 {
358 struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
359 int rc;
360
361 rc = rpcrdma_recvcq_poll(cq, ep);
362 if (rc) {
363 dprintk("RPC: %s: ib_poll_cq failed: %i\n",
364 __func__, rc);
365 return;
366 }
367
368 rc = ib_req_notify_cq(cq,
369 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
370 if (rc == 0)
371 return;
372 if (rc < 0) {
373 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
374 __func__, rc);
375 return;
376 }
377
378 rpcrdma_recvcq_poll(cq, ep);
379 }
380
381 static void
382 rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
383 {
384 struct ib_wc wc;
385 LIST_HEAD(sched_list);
386
387 while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
388 rpcrdma_recvcq_process_wc(&wc, &sched_list);
389 if (!list_empty(&sched_list))
390 rpcrdma_schedule_tasklet(&sched_list);
391 while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
392 rpcrdma_sendcq_process_wc(&wc);
393 }
394
395 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
396 static const char * const conn[] = {
397 "address resolved",
398 "address error",
399 "route resolved",
400 "route error",
401 "connect request",
402 "connect response",
403 "connect error",
404 "unreachable",
405 "rejected",
406 "established",
407 "disconnected",
408 "device removal",
409 "multicast join",
410 "multicast error",
411 "address change",
412 "timewait exit",
413 };
414
415 #define CONNECTION_MSG(status) \
416 ((status) < ARRAY_SIZE(conn) ? \
417 conn[(status)] : "unrecognized connection error")
418 #endif
419
420 static int
421 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
422 {
423 struct rpcrdma_xprt *xprt = id->context;
424 struct rpcrdma_ia *ia = &xprt->rx_ia;
425 struct rpcrdma_ep *ep = &xprt->rx_ep;
426 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
427 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
428 #endif
429 struct ib_qp_attr *attr = &ia->ri_qp_attr;
430 struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
431 int connstate = 0;
432
433 switch (event->event) {
434 case RDMA_CM_EVENT_ADDR_RESOLVED:
435 case RDMA_CM_EVENT_ROUTE_RESOLVED:
436 ia->ri_async_rc = 0;
437 complete(&ia->ri_done);
438 break;
439 case RDMA_CM_EVENT_ADDR_ERROR:
440 ia->ri_async_rc = -EHOSTUNREACH;
441 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
442 __func__, ep);
443 complete(&ia->ri_done);
444 break;
445 case RDMA_CM_EVENT_ROUTE_ERROR:
446 ia->ri_async_rc = -ENETUNREACH;
447 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
448 __func__, ep);
449 complete(&ia->ri_done);
450 break;
451 case RDMA_CM_EVENT_ESTABLISHED:
452 connstate = 1;
453 ib_query_qp(ia->ri_id->qp, attr,
454 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
455 iattr);
456 dprintk("RPC: %s: %d responder resources"
457 " (%d initiator)\n",
458 __func__, attr->max_dest_rd_atomic,
459 attr->max_rd_atomic);
460 goto connected;
461 case RDMA_CM_EVENT_CONNECT_ERROR:
462 connstate = -ENOTCONN;
463 goto connected;
464 case RDMA_CM_EVENT_UNREACHABLE:
465 connstate = -ENETDOWN;
466 goto connected;
467 case RDMA_CM_EVENT_REJECTED:
468 connstate = -ECONNREFUSED;
469 goto connected;
470 case RDMA_CM_EVENT_DISCONNECTED:
471 connstate = -ECONNABORTED;
472 goto connected;
473 case RDMA_CM_EVENT_DEVICE_REMOVAL:
474 connstate = -ENODEV;
475 connected:
476 dprintk("RPC: %s: %sconnected\n",
477 __func__, connstate > 0 ? "" : "dis");
478 ep->rep_connected = connstate;
479 rpcrdma_conn_func(ep);
480 wake_up_all(&ep->rep_connect_wait);
481 /*FALLTHROUGH*/
482 default:
483 dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n",
484 __func__, &addr->sin_addr.s_addr,
485 ntohs(addr->sin_port), ep,
486 CONNECTION_MSG(event->event));
487 break;
488 }
489
490 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
491 if (connstate == 1) {
492 int ird = attr->max_dest_rd_atomic;
493 int tird = ep->rep_remote_cma.responder_resources;
494 printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
495 "on %s, memreg %d slots %d ird %d%s\n",
496 &addr->sin_addr.s_addr,
497 ntohs(addr->sin_port),
498 ia->ri_id->device->name,
499 ia->ri_memreg_strategy,
500 xprt->rx_buf.rb_max_requests,
501 ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
502 } else if (connstate < 0) {
503 printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
504 &addr->sin_addr.s_addr,
505 ntohs(addr->sin_port),
506 connstate);
507 }
508 #endif
509
510 return 0;
511 }
512
513 static struct rdma_cm_id *
514 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
515 struct rpcrdma_ia *ia, struct sockaddr *addr)
516 {
517 struct rdma_cm_id *id;
518 int rc;
519
520 init_completion(&ia->ri_done);
521
522 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
523 if (IS_ERR(id)) {
524 rc = PTR_ERR(id);
525 dprintk("RPC: %s: rdma_create_id() failed %i\n",
526 __func__, rc);
527 return id;
528 }
529
530 ia->ri_async_rc = -ETIMEDOUT;
531 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
532 if (rc) {
533 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
534 __func__, rc);
535 goto out;
536 }
537 wait_for_completion_interruptible_timeout(&ia->ri_done,
538 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
539 rc = ia->ri_async_rc;
540 if (rc)
541 goto out;
542
543 ia->ri_async_rc = -ETIMEDOUT;
544 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
545 if (rc) {
546 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
547 __func__, rc);
548 goto out;
549 }
550 wait_for_completion_interruptible_timeout(&ia->ri_done,
551 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
552 rc = ia->ri_async_rc;
553 if (rc)
554 goto out;
555
556 return id;
557
558 out:
559 rdma_destroy_id(id);
560 return ERR_PTR(rc);
561 }
562
563 /*
564  * Drain any CQ prior to teardown.
565 */
566 static void
567 rpcrdma_clean_cq(struct ib_cq *cq)
568 {
569 struct ib_wc wc;
570 int count = 0;
571
572 while (1 == ib_poll_cq(cq, 1, &wc))
573 ++count;
574
575 if (count)
576 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
577 __func__, count, wc.opcode);
578 }
579
580 /*
581 * Exported functions.
582 */
583
584 /*
585 * Open and initialize an Interface Adapter.
586 * o initializes fields of struct rpcrdma_ia, including
587 * interface and provider attributes and protection zone.
588 */
589 int
590 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
591 {
592 int rc, mem_priv;
593 struct rpcrdma_ia *ia = &xprt->rx_ia;
594 struct ib_device_attr *devattr = &ia->ri_devattr;
595
596 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
597 if (IS_ERR(ia->ri_id)) {
598 rc = PTR_ERR(ia->ri_id);
599 goto out1;
600 }
601
602 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
603 if (IS_ERR(ia->ri_pd)) {
604 rc = PTR_ERR(ia->ri_pd);
605 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
606 __func__, rc);
607 goto out2;
608 }
609
610 rc = ib_query_device(ia->ri_id->device, devattr);
611 if (rc) {
612 dprintk("RPC: %s: ib_query_device failed %d\n",
613 __func__, rc);
614 goto out3;
615 }
616
617 if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
618 ia->ri_have_dma_lkey = 1;
619 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
620 }
621
622 if (memreg == RPCRDMA_FRMR) {
623 /* Requires both frmr reg and local dma lkey */
624 if ((devattr->device_cap_flags &
625 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
626 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
627 dprintk("RPC: %s: FRMR registration "
628 "not supported by HCA\n", __func__);
629 memreg = RPCRDMA_MTHCAFMR;
630 } else {
631 /* Mind the ia limit on FRMR page list depth */
632 ia->ri_max_frmr_depth = min_t(unsigned int,
633 RPCRDMA_MAX_DATA_SEGS,
634 devattr->max_fast_reg_page_list_len);
635 }
636 }
637 if (memreg == RPCRDMA_MTHCAFMR) {
638 if (!ia->ri_id->device->alloc_fmr) {
639 dprintk("RPC: %s: MTHCAFMR registration "
640 "not supported by HCA\n", __func__);
641 memreg = RPCRDMA_ALLPHYSICAL;
642 }
643 }
644
645 /*
646 * Optionally obtain an underlying physical identity mapping in
647 * order to do a memory window-based bind. This base registration
648  * is protected from remote access; remote access is enabled only by
649  * binding the specific bytes targeted during each RPC operation, and
650  * is revoked after the corresponding completion, similar to a storage
651 * adapter.
652 */
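	/*
	 * A rough summary of the switch below: RPCRDMA_ALLPHYSICAL registers
	 * a DMA MR with local-write plus remote read/write access;
	 * RPCRDMA_MTHCAFMR registers a local-write-only DMA MR unless the
	 * device already supplies a local DMA lkey; RPCRDMA_FRMR needs no
	 * ib_get_dma_mr() call here at all.
	 */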
653 switch (memreg) {
654 case RPCRDMA_FRMR:
655 break;
656 case RPCRDMA_ALLPHYSICAL:
657 mem_priv = IB_ACCESS_LOCAL_WRITE |
658 IB_ACCESS_REMOTE_WRITE |
659 IB_ACCESS_REMOTE_READ;
660 goto register_setup;
661 case RPCRDMA_MTHCAFMR:
662 if (ia->ri_have_dma_lkey)
663 break;
664 mem_priv = IB_ACCESS_LOCAL_WRITE;
665 register_setup:
666 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
667 if (IS_ERR(ia->ri_bind_mem)) {
668 printk(KERN_ALERT "%s: ib_get_dma_mr for "
669 "phys register failed with %lX\n",
670 __func__, PTR_ERR(ia->ri_bind_mem));
671 rc = -ENOMEM;
672 goto out3;
673 }
674 break;
675 default:
676 printk(KERN_ERR "RPC: Unsupported memory "
677 "registration mode: %d\n", memreg);
678 rc = -ENOMEM;
679 goto out3;
680 }
681 dprintk("RPC: %s: memory registration strategy is %d\n",
682 __func__, memreg);
683
684 /* Else will do memory reg/dereg for each chunk */
685 ia->ri_memreg_strategy = memreg;
686
687 rwlock_init(&ia->ri_qplock);
688 return 0;
689
690 out3:
691 ib_dealloc_pd(ia->ri_pd);
692 ia->ri_pd = NULL;
693 out2:
694 rdma_destroy_id(ia->ri_id);
695 ia->ri_id = NULL;
696 out1:
697 return rc;
698 }
699
700 /*
701 * Clean up/close an IA.
702 * o if event handles and PD have been initialized, free them.
703 * o close the IA
704 */
705 void
706 rpcrdma_ia_close(struct rpcrdma_ia *ia)
707 {
708 int rc;
709
710 dprintk("RPC: %s: entering\n", __func__);
711 if (ia->ri_bind_mem != NULL) {
712 rc = ib_dereg_mr(ia->ri_bind_mem);
713 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
714 __func__, rc);
715 }
716 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
717 if (ia->ri_id->qp)
718 rdma_destroy_qp(ia->ri_id);
719 rdma_destroy_id(ia->ri_id);
720 ia->ri_id = NULL;
721 }
722 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
723 rc = ib_dealloc_pd(ia->ri_pd);
724 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
725 __func__, rc);
726 }
727 }
728
729 /*
730 * Create unconnected endpoint.
731 */
732 int
733 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
734 struct rpcrdma_create_data_internal *cdata)
735 {
736 struct ib_device_attr *devattr = &ia->ri_devattr;
737 struct ib_cq *sendcq, *recvcq;
738 int rc, err;
739
740 /* check provider's send/recv wr limits */
741 if (cdata->max_requests > devattr->max_qp_wr)
742 cdata->max_requests = devattr->max_qp_wr;
743
744 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
745 ep->rep_attr.qp_context = ep;
746 /* send_cq and recv_cq initialized below */
747 ep->rep_attr.srq = NULL;
748 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
749 switch (ia->ri_memreg_strategy) {
750 case RPCRDMA_FRMR: {
751 int depth = 7;
752
753 /* Add room for frmr register and invalidate WRs.
754 * 1. FRMR reg WR for head
755 * 2. FRMR invalidate WR for head
756 * 3. N FRMR reg WRs for pagelist
757 * 4. N FRMR invalidate WRs for pagelist
758 * 5. FRMR reg WR for tail
759 * 6. FRMR invalidate WR for tail
760 * 7. The RDMA_SEND WR
761 */
762
763 /* Calculate N if the device max FRMR depth is smaller than
764 * RPCRDMA_MAX_DATA_SEGS.
765 */
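		/* Worked example (illustrative values only): if
		 * RPCRDMA_MAX_DATA_SEGS were 64 and ia->ri_max_frmr_depth
		 * were 16, then delta = 48 and the loop below runs three
		 * times, giving depth = 7 + 3 * 2 = 13 WRs per RPC on the
		 * send queue.
		 */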
766 if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
767 int delta = RPCRDMA_MAX_DATA_SEGS -
768 ia->ri_max_frmr_depth;
769
770 do {
771 depth += 2; /* FRMR reg + invalidate */
772 delta -= ia->ri_max_frmr_depth;
773 } while (delta > 0);
774
775 }
776 ep->rep_attr.cap.max_send_wr *= depth;
777 if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
778 cdata->max_requests = devattr->max_qp_wr / depth;
779 if (!cdata->max_requests)
780 return -EINVAL;
781 ep->rep_attr.cap.max_send_wr = cdata->max_requests *
782 depth;
783 }
784 break;
785 }
786 default:
787 break;
788 }
789 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
790 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
791 ep->rep_attr.cap.max_recv_sge = 1;
792 ep->rep_attr.cap.max_inline_data = 0;
793 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
794 ep->rep_attr.qp_type = IB_QPT_RC;
795 ep->rep_attr.port_num = ~0;
796
797 if (cdata->padding) {
798 ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
799 GFP_KERNEL);
800 if (IS_ERR(ep->rep_padbuf))
801 return PTR_ERR(ep->rep_padbuf);
802 } else
803 ep->rep_padbuf = NULL;
804
805 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
806 "iovs: send %d recv %d\n",
807 __func__,
808 ep->rep_attr.cap.max_send_wr,
809 ep->rep_attr.cap.max_recv_wr,
810 ep->rep_attr.cap.max_send_sge,
811 ep->rep_attr.cap.max_recv_sge);
812
813 /* set trigger for requesting send completion */
814 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
815 if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
816 ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
817 else if (ep->rep_cqinit <= 2)
818 ep->rep_cqinit = 0;
819 INIT_CQCOUNT(ep);
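	/* For example (hypothetical numbers): with max_send_wr = 128 the
	 * trigger starts at 63 and is clamped to RPCRDMA_MAX_UNSIGNALED_SENDS
	 * if that is smaller; rpcrdma_ep_post() signals a send completion
	 * only when this countdown is exhausted.
	 */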
820 init_waitqueue_head(&ep->rep_connect_wait);
821 INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
822
823 sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
824 rpcrdma_cq_async_error_upcall, ep,
825 ep->rep_attr.cap.max_send_wr + 1, 0);
826 if (IS_ERR(sendcq)) {
827 rc = PTR_ERR(sendcq);
828 dprintk("RPC: %s: failed to create send CQ: %i\n",
829 __func__, rc);
830 goto out1;
831 }
832
833 rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
834 if (rc) {
835 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
836 __func__, rc);
837 goto out2;
838 }
839
840 recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
841 rpcrdma_cq_async_error_upcall, ep,
842 ep->rep_attr.cap.max_recv_wr + 1, 0);
843 if (IS_ERR(recvcq)) {
844 rc = PTR_ERR(recvcq);
845 dprintk("RPC: %s: failed to create recv CQ: %i\n",
846 __func__, rc);
847 goto out2;
848 }
849
850 rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
851 if (rc) {
852 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
853 __func__, rc);
854 ib_destroy_cq(recvcq);
855 goto out2;
856 }
857
858 ep->rep_attr.send_cq = sendcq;
859 ep->rep_attr.recv_cq = recvcq;
860
861 /* Initialize cma parameters */
862
863 /* RPC/RDMA does not use private data */
864 ep->rep_remote_cma.private_data = NULL;
865 ep->rep_remote_cma.private_data_len = 0;
866
867 /* Client offers RDMA Read but does not initiate */
868 ep->rep_remote_cma.initiator_depth = 0;
869 if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
870 ep->rep_remote_cma.responder_resources = 32;
871 else
872 ep->rep_remote_cma.responder_resources =
873 devattr->max_qp_rd_atom;
874
875 ep->rep_remote_cma.retry_count = 7;
876 ep->rep_remote_cma.flow_control = 0;
877 ep->rep_remote_cma.rnr_retry_count = 0;
878
879 return 0;
880
881 out2:
882 err = ib_destroy_cq(sendcq);
883 if (err)
884 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
885 __func__, err);
886 out1:
887 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
888 return rc;
889 }
890
891 /*
892 * rpcrdma_ep_destroy
893 *
894 * Disconnect and destroy endpoint. After this, the only
895 * valid operations on the ep are to free it (if dynamically
896 * allocated) or re-create it.
897 */
898 void
899 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
900 {
901 int rc;
902
903 dprintk("RPC: %s: entering, connected is %d\n",
904 __func__, ep->rep_connected);
905
906 cancel_delayed_work_sync(&ep->rep_connect_worker);
907
908 if (ia->ri_id->qp) {
909 rpcrdma_ep_disconnect(ep, ia);
910 rdma_destroy_qp(ia->ri_id);
911 ia->ri_id->qp = NULL;
912 }
913
914 rpcrdma_free_regbuf(ia, ep->rep_padbuf);
915
916 rpcrdma_clean_cq(ep->rep_attr.recv_cq);
917 rc = ib_destroy_cq(ep->rep_attr.recv_cq);
918 if (rc)
919 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
920 __func__, rc);
921
922 rpcrdma_clean_cq(ep->rep_attr.send_cq);
923 rc = ib_destroy_cq(ep->rep_attr.send_cq);
924 if (rc)
925 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
926 __func__, rc);
927 }
928
929 /*
930 * Connect unconnected endpoint.
931 */
932 int
933 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
934 {
935 struct rdma_cm_id *id, *old;
936 int rc = 0;
937 int retry_count = 0;
938
939 if (ep->rep_connected != 0) {
940 struct rpcrdma_xprt *xprt;
941 retry:
942 dprintk("RPC: %s: reconnecting...\n", __func__);
943
944 rpcrdma_ep_disconnect(ep, ia);
945 rpcrdma_flush_cqs(ep);
946
947 switch (ia->ri_memreg_strategy) {
948 case RPCRDMA_FRMR:
949 rpcrdma_reset_frmrs(ia);
950 break;
951 case RPCRDMA_MTHCAFMR:
952 rpcrdma_reset_fmrs(ia);
953 break;
954 case RPCRDMA_ALLPHYSICAL:
955 break;
956 default:
957 rc = -EIO;
958 goto out;
959 }
960
961 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
962 id = rpcrdma_create_id(xprt, ia,
963 (struct sockaddr *)&xprt->rx_data.addr);
964 if (IS_ERR(id)) {
965 rc = -EHOSTUNREACH;
966 goto out;
967 }
968 /* TEMP TEMP TEMP - fail if new device:
969 * Deregister/remarshal *all* requests!
970 * Close and recreate adapter, pd, etc!
971 * Re-determine all attributes still sane!
972 * More stuff I haven't thought of!
973 * Rrrgh!
974 */
975 if (ia->ri_id->device != id->device) {
976 printk("RPC: %s: can't reconnect on "
977 "different device!\n", __func__);
978 rdma_destroy_id(id);
979 rc = -ENETUNREACH;
980 goto out;
981 }
982 /* END TEMP */
983 rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
984 if (rc) {
985 dprintk("RPC: %s: rdma_create_qp failed %i\n",
986 __func__, rc);
987 rdma_destroy_id(id);
988 rc = -ENETUNREACH;
989 goto out;
990 }
991
992 write_lock(&ia->ri_qplock);
993 old = ia->ri_id;
994 ia->ri_id = id;
995 write_unlock(&ia->ri_qplock);
996
997 rdma_destroy_qp(old);
998 rdma_destroy_id(old);
999 } else {
1000 dprintk("RPC: %s: connecting...\n", __func__);
1001 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
1002 if (rc) {
1003 dprintk("RPC: %s: rdma_create_qp failed %i\n",
1004 __func__, rc);
1005 /* do not update ep->rep_connected */
1006 return -ENETUNREACH;
1007 }
1008 }
1009
1010 ep->rep_connected = 0;
1011
1012 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
1013 if (rc) {
1014 dprintk("RPC: %s: rdma_connect() failed with %i\n",
1015 __func__, rc);
1016 goto out;
1017 }
1018
1019 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
1020
1021 /*
1022 * Check state. A non-peer reject indicates no listener
1023 * (ECONNREFUSED), which may be a transient state. All
1024 	 * others indicate a transport condition that has already been
1025 	 * retried on a best-effort basis.
1026 */
1027 if (ep->rep_connected == -ECONNREFUSED &&
1028 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
1029 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
1030 goto retry;
1031 }
1032 if (ep->rep_connected <= 0) {
1033 		/* Sometimes, the only way to reliably connect to remote
1034 		 * CMs is to use the same nonzero values for ORD and IRD. */
1035 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
1036 (ep->rep_remote_cma.responder_resources == 0 ||
1037 ep->rep_remote_cma.initiator_depth !=
1038 ep->rep_remote_cma.responder_resources)) {
1039 if (ep->rep_remote_cma.responder_resources == 0)
1040 ep->rep_remote_cma.responder_resources = 1;
1041 ep->rep_remote_cma.initiator_depth =
1042 ep->rep_remote_cma.responder_resources;
1043 goto retry;
1044 }
1045 rc = ep->rep_connected;
1046 } else {
1047 dprintk("RPC: %s: connected\n", __func__);
1048 }
1049
1050 out:
1051 if (rc)
1052 ep->rep_connected = rc;
1053 return rc;
1054 }
1055
1056 /*
1057 * rpcrdma_ep_disconnect
1058 *
1059 * This is separate from destroy to facilitate the ability
1060 * to reconnect without recreating the endpoint.
1061 *
1062 * This call is not reentrant, and must not be made in parallel
1063 * on the same endpoint.
1064 */
1065 void
1066 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
1067 {
1068 int rc;
1069
1070 rpcrdma_flush_cqs(ep);
1071 rc = rdma_disconnect(ia->ri_id);
1072 if (!rc) {
1073 /* returns without wait if not connected */
1074 wait_event_interruptible(ep->rep_connect_wait,
1075 ep->rep_connected != 1);
1076 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
1077 (ep->rep_connected == 1) ? "still " : "dis");
1078 } else {
1079 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
1080 ep->rep_connected = rc;
1081 }
1082 }
1083
1084 static struct rpcrdma_req *
1085 rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
1086 {
1087 struct rpcrdma_req *req;
1088
1089 req = kzalloc(sizeof(*req), GFP_KERNEL);
1090 if (req == NULL)
1091 return ERR_PTR(-ENOMEM);
1092
1093 req->rl_buffer = &r_xprt->rx_buf;
1094 return req;
1095 }
1096
1097 static struct rpcrdma_rep *
1098 rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
1099 {
1100 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1101 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1102 struct rpcrdma_rep *rep;
1103 int rc;
1104
1105 rc = -ENOMEM;
1106 rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1107 if (rep == NULL)
1108 goto out;
1109
1110 rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
1111 GFP_KERNEL);
1112 if (IS_ERR(rep->rr_rdmabuf)) {
1113 rc = PTR_ERR(rep->rr_rdmabuf);
1114 goto out_free;
1115 }
1116
1117 rep->rr_buffer = &r_xprt->rx_buf;
1118 return rep;
1119
1120 out_free:
1121 kfree(rep);
1122 out:
1123 return ERR_PTR(rc);
1124 }
1125
1126 static int
1127 rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1128 {
1129 int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
1130 struct ib_fmr_attr fmr_attr = {
1131 .max_pages = RPCRDMA_MAX_DATA_SEGS,
1132 .max_maps = 1,
1133 .page_shift = PAGE_SHIFT
1134 };
1135 struct rpcrdma_mw *r;
1136 int i, rc;
1137
1138 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1139 dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
1140
1141 while (i--) {
1142 r = kzalloc(sizeof(*r), GFP_KERNEL);
1143 if (r == NULL)
1144 return -ENOMEM;
1145
1146 r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
1147 if (IS_ERR(r->r.fmr)) {
1148 rc = PTR_ERR(r->r.fmr);
1149 dprintk("RPC: %s: ib_alloc_fmr failed %i\n",
1150 __func__, rc);
1151 goto out_free;
1152 }
1153
1154 list_add(&r->mw_list, &buf->rb_mws);
1155 list_add(&r->mw_all, &buf->rb_all);
1156 }
1157 return 0;
1158
1159 out_free:
1160 kfree(r);
1161 return rc;
1162 }
1163
1164 static int
1165 rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
1166 {
1167 struct rpcrdma_frmr *f;
1168 struct rpcrdma_mw *r;
1169 int i, rc;
1170
1171 i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
1172 dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
1173
1174 while (i--) {
1175 r = kzalloc(sizeof(*r), GFP_KERNEL);
1176 if (r == NULL)
1177 return -ENOMEM;
1178 f = &r->r.frmr;
1179
1180 f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1181 ia->ri_max_frmr_depth);
1182 if (IS_ERR(f->fr_mr)) {
1183 rc = PTR_ERR(f->fr_mr);
1184 dprintk("RPC: %s: ib_alloc_fast_reg_mr "
1185 "failed %i\n", __func__, rc);
1186 goto out_free;
1187 }
1188
1189 f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
1190 ia->ri_max_frmr_depth);
1191 if (IS_ERR(f->fr_pgl)) {
1192 rc = PTR_ERR(f->fr_pgl);
1193 dprintk("RPC: %s: ib_alloc_fast_reg_page_list "
1194 "failed %i\n", __func__, rc);
1195
1196 ib_dereg_mr(f->fr_mr);
1197 goto out_free;
1198 }
1199
1200 list_add(&r->mw_list, &buf->rb_mws);
1201 list_add(&r->mw_all, &buf->rb_all);
1202 }
1203
1204 return 0;
1205
1206 out_free:
1207 kfree(r);
1208 return rc;
1209 }
1210
1211 int
1212 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
1213 {
1214 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1215 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1216 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
1217 char *p;
1218 size_t len;
1219 int i, rc;
1220
1221 buf->rb_max_requests = cdata->max_requests;
1222 spin_lock_init(&buf->rb_lock);
1223
1224 /* Need to allocate:
1225 * 1. arrays for send and recv pointers
1226 * 2. arrays of struct rpcrdma_req to fill in pointers
1227 * 3. array of struct rpcrdma_rep for replies
1228 * Send/recv buffers in req/rep need to be registered
1229 */
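	/* Layout of the single allocation carved up below:
	 *
	 *   rb_pool --> [ rb_max_requests * (struct rpcrdma_req *) ]
	 *               [ rb_max_requests * (struct rpcrdma_rep *) ]
	 *
	 * rb_send_bufs points at the first array and rb_recv_bufs at the
	 * second; the req and rep structures themselves are allocated
	 * individually by rpcrdma_create_req() and rpcrdma_create_rep().
	 */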
1230 len = buf->rb_max_requests *
1231 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
1232
1233 p = kzalloc(len, GFP_KERNEL);
1234 if (p == NULL) {
1235 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1236 __func__, len);
1237 rc = -ENOMEM;
1238 goto out;
1239 }
1240 buf->rb_pool = p; /* for freeing it later */
1241
1242 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1243 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1244 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1245 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1246
1247 INIT_LIST_HEAD(&buf->rb_mws);
1248 INIT_LIST_HEAD(&buf->rb_all);
1249 switch (ia->ri_memreg_strategy) {
1250 case RPCRDMA_FRMR:
1251 rc = rpcrdma_init_frmrs(ia, buf);
1252 if (rc)
1253 goto out;
1254 break;
1255 case RPCRDMA_MTHCAFMR:
1256 rc = rpcrdma_init_fmrs(ia, buf);
1257 if (rc)
1258 goto out;
1259 break;
1260 default:
1261 break;
1262 }
1263
1264 for (i = 0; i < buf->rb_max_requests; i++) {
1265 struct rpcrdma_req *req;
1266 struct rpcrdma_rep *rep;
1267
1268 req = rpcrdma_create_req(r_xprt);
1269 if (IS_ERR(req)) {
1270 dprintk("RPC: %s: request buffer %d alloc"
1271 " failed\n", __func__, i);
1272 rc = PTR_ERR(req);
1273 goto out;
1274 }
1275 buf->rb_send_bufs[i] = req;
1276
1277 rep = rpcrdma_create_rep(r_xprt);
1278 if (IS_ERR(rep)) {
1279 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1280 __func__, i);
1281 rc = PTR_ERR(rep);
1282 goto out;
1283 }
1284 buf->rb_recv_bufs[i] = rep;
1285 }
1286
1287 return 0;
1288 out:
1289 rpcrdma_buffer_destroy(buf);
1290 return rc;
1291 }
1292
1293 static void
1294 rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
1295 {
1296 if (!rep)
1297 return;
1298
1299 rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
1300 kfree(rep);
1301 }
1302
1303 static void
1304 rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
1305 {
1306 if (!req)
1307 return;
1308
1309 rpcrdma_free_regbuf(ia, req->rl_sendbuf);
1310 rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
1311 kfree(req);
1312 }
1313
1314 static void
1315 rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
1316 {
1317 struct rpcrdma_mw *r;
1318 int rc;
1319
1320 while (!list_empty(&buf->rb_all)) {
1321 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1322 list_del(&r->mw_all);
1323 list_del(&r->mw_list);
1324
1325 rc = ib_dealloc_fmr(r->r.fmr);
1326 if (rc)
1327 dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
1328 __func__, rc);
1329
1330 kfree(r);
1331 }
1332 }
1333
1334 static void
1335 rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
1336 {
1337 struct rpcrdma_mw *r;
1338 int rc;
1339
1340 while (!list_empty(&buf->rb_all)) {
1341 r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
1342 list_del(&r->mw_all);
1343 list_del(&r->mw_list);
1344
1345 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1346 if (rc)
1347 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1348 __func__, rc);
1349 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1350
1351 kfree(r);
1352 }
1353 }
1354
1355 void
1356 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1357 {
1358 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1359 int i;
1360
1361 /* clean up in reverse order from create
1362 * 1. recv mr memory (mr free, then kfree)
1363 * 2. send mr memory (mr free, then kfree)
1364 * 3. MWs
1365 */
1366 dprintk("RPC: %s: entering\n", __func__);
1367
1368 for (i = 0; i < buf->rb_max_requests; i++) {
1369 if (buf->rb_recv_bufs)
1370 rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
1371 if (buf->rb_send_bufs)
1372 rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
1373 }
1374
1375 switch (ia->ri_memreg_strategy) {
1376 case RPCRDMA_FRMR:
1377 rpcrdma_destroy_frmrs(buf);
1378 break;
1379 case RPCRDMA_MTHCAFMR:
1380 rpcrdma_destroy_fmrs(buf);
1381 break;
1382 default:
1383 break;
1384 }
1385
1386 kfree(buf->rb_pool);
1387 }
1388
1389 /* After a disconnect, unmap all FMRs.
1390 *
1391 * This is invoked only in the transport connect worker in order
1392 * to serialize with rpcrdma_register_fmr_external().
1393 */
1394 static void
1395 rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
1396 {
1397 struct rpcrdma_xprt *r_xprt =
1398 container_of(ia, struct rpcrdma_xprt, rx_ia);
1399 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1400 struct list_head *pos;
1401 struct rpcrdma_mw *r;
1402 LIST_HEAD(l);
1403 int rc;
1404
1405 list_for_each(pos, &buf->rb_all) {
1406 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1407
1408 INIT_LIST_HEAD(&l);
1409 list_add(&r->r.fmr->list, &l);
1410 rc = ib_unmap_fmr(&l);
1411 if (rc)
1412 dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
1413 __func__, rc);
1414 }
1415 }
1416
1417 /* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
1418 * an unusable state. Find FRMRs in this state and dereg / reg
1419 * each. FRMRs that are VALID and attached to an rpcrdma_req are
1420 * also torn down.
1421 *
1422 * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
1423 *
1424 * This is invoked only in the transport connect worker in order
1425 * to serialize with rpcrdma_register_frmr_external().
1426 */
1427 static void
1428 rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
1429 {
1430 struct rpcrdma_xprt *r_xprt =
1431 container_of(ia, struct rpcrdma_xprt, rx_ia);
1432 struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
1433 struct list_head *pos;
1434 struct rpcrdma_mw *r;
1435 int rc;
1436
1437 list_for_each(pos, &buf->rb_all) {
1438 r = list_entry(pos, struct rpcrdma_mw, mw_all);
1439
1440 if (r->r.frmr.fr_state == FRMR_IS_INVALID)
1441 continue;
1442
1443 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1444 if (rc)
1445 dprintk("RPC: %s: ib_dereg_mr failed %i\n",
1446 __func__, rc);
1447 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1448
1449 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1450 ia->ri_max_frmr_depth);
1451 if (IS_ERR(r->r.frmr.fr_mr)) {
1452 rc = PTR_ERR(r->r.frmr.fr_mr);
1453 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1454 " failed %i\n", __func__, rc);
1455 continue;
1456 }
1457 r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
1458 ia->ri_id->device,
1459 ia->ri_max_frmr_depth);
1460 if (IS_ERR(r->r.frmr.fr_pgl)) {
1461 rc = PTR_ERR(r->r.frmr.fr_pgl);
1462 dprintk("RPC: %s: "
1463 "ib_alloc_fast_reg_page_list "
1464 "failed %i\n", __func__, rc);
1465
1466 ib_dereg_mr(r->r.frmr.fr_mr);
1467 continue;
1468 }
1469 r->r.frmr.fr_state = FRMR_IS_INVALID;
1470 }
1471 }
1472
1473 /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
1474 * some req segments uninitialized.
1475 */
1476 static void
1477 rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
1478 {
1479 if (*mw) {
1480 list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
1481 *mw = NULL;
1482 }
1483 }
1484
1485 /* Cycle mw's back in reverse order, and "spin" them.
1486 * This delays and scrambles reuse as much as possible.
1487 */
1488 static void
1489 rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1490 {
1491 struct rpcrdma_mr_seg *seg = req->rl_segments;
1492 struct rpcrdma_mr_seg *seg1 = seg;
1493 int i;
1494
1495 for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
1496 rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
1497 rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
1498 }
1499
1500 static void
1501 rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1502 {
1503 buf->rb_send_bufs[--buf->rb_send_index] = req;
1504 req->rl_niovs = 0;
1505 if (req->rl_reply) {
1506 buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
1507 req->rl_reply->rr_func = NULL;
1508 req->rl_reply = NULL;
1509 }
1510 }
1511
1512 /* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
1513 * Redo only the ib_post_send().
1514 */
1515 static void
1516 rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
1517 {
1518 struct rpcrdma_xprt *r_xprt =
1519 container_of(ia, struct rpcrdma_xprt, rx_ia);
1520 struct ib_send_wr invalidate_wr, *bad_wr;
1521 int rc;
1522
1523 dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
1524
1525 /* When this FRMR is re-inserted into rb_mws, it is no longer stale */
1526 r->r.frmr.fr_state = FRMR_IS_INVALID;
1527
1528 memset(&invalidate_wr, 0, sizeof(invalidate_wr));
1529 invalidate_wr.wr_id = (unsigned long)(void *)r;
1530 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1531 invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
1532 DECR_CQCOUNT(&r_xprt->rx_ep);
1533
1534 dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
1535 __func__, r, r->r.frmr.fr_mr->rkey);
1536
1537 read_lock(&ia->ri_qplock);
1538 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1539 read_unlock(&ia->ri_qplock);
1540 if (rc) {
1541 /* Force rpcrdma_buffer_get() to retry */
1542 r->r.frmr.fr_state = FRMR_IS_STALE;
1543 dprintk("RPC: %s: ib_post_send failed, %i\n",
1544 __func__, rc);
1545 }
1546 }
1547
1548 static void
1549 rpcrdma_retry_flushed_linv(struct list_head *stale,
1550 struct rpcrdma_buffer *buf)
1551 {
1552 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1553 struct list_head *pos;
1554 struct rpcrdma_mw *r;
1555 unsigned long flags;
1556
1557 list_for_each(pos, stale) {
1558 r = list_entry(pos, struct rpcrdma_mw, mw_list);
1559 rpcrdma_retry_local_inv(r, ia);
1560 }
1561
1562 spin_lock_irqsave(&buf->rb_lock, flags);
1563 list_splice_tail(stale, &buf->rb_mws);
1564 spin_unlock_irqrestore(&buf->rb_lock, flags);
1565 }
1566
1567 static struct rpcrdma_req *
1568 rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
1569 struct list_head *stale)
1570 {
1571 struct rpcrdma_mw *r;
1572 int i;
1573
1574 i = RPCRDMA_MAX_SEGS - 1;
1575 while (!list_empty(&buf->rb_mws)) {
1576 r = list_entry(buf->rb_mws.next,
1577 struct rpcrdma_mw, mw_list);
1578 list_del(&r->mw_list);
1579 if (r->r.frmr.fr_state == FRMR_IS_STALE) {
1580 list_add(&r->mw_list, stale);
1581 continue;
1582 }
1583 req->rl_segments[i].rl_mw = r;
1584 if (unlikely(i-- == 0))
1585 return req; /* Success */
1586 }
1587
1588 /* Not enough entries on rb_mws for this req */
1589 rpcrdma_buffer_put_sendbuf(req, buf);
1590 rpcrdma_buffer_put_mrs(req, buf);
1591 return NULL;
1592 }
1593
1594 static struct rpcrdma_req *
1595 rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
1596 {
1597 struct rpcrdma_mw *r;
1598 int i;
1599
1600 i = RPCRDMA_MAX_SEGS - 1;
1601 while (!list_empty(&buf->rb_mws)) {
1602 r = list_entry(buf->rb_mws.next,
1603 struct rpcrdma_mw, mw_list);
1604 list_del(&r->mw_list);
1605 req->rl_segments[i].rl_mw = r;
1606 if (unlikely(i-- == 0))
1607 return req; /* Success */
1608 }
1609
1610 /* Not enough entries on rb_mws for this req */
1611 rpcrdma_buffer_put_sendbuf(req, buf);
1612 rpcrdma_buffer_put_mrs(req, buf);
1613 return NULL;
1614 }
1615
1616 /*
1617 * Get a set of request/reply buffers.
1618 *
1619 * Reply buffer (if needed) is attached to send buffer upon return.
1620 * Rule:
1621 * rb_send_index and rb_recv_index MUST always be pointing to the
1622 * *next* available buffer (non-NULL). They are incremented after
1623 * removing buffers, and decremented *before* returning them.
1624 */
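/*
 * Example of the rule above, assuming both indexes start at zero with a
 * hypothetical pool of two requests: rpcrdma_buffer_get() hands out
 * rb_send_bufs[0], NULLs the slot, and post-increments rb_send_index to 1;
 * rpcrdma_buffer_put() later pre-decrements the index back to 0 before
 * restoring the pointer, so the index always names the next available slot.
 */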
1625 struct rpcrdma_req *
1626 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1627 {
1628 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1629 struct list_head stale;
1630 struct rpcrdma_req *req;
1631 unsigned long flags;
1632
1633 spin_lock_irqsave(&buffers->rb_lock, flags);
1634 if (buffers->rb_send_index == buffers->rb_max_requests) {
1635 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1636 dprintk("RPC: %s: out of request buffers\n", __func__);
1637 return ((struct rpcrdma_req *)NULL);
1638 }
1639
1640 req = buffers->rb_send_bufs[buffers->rb_send_index];
1641 if (buffers->rb_send_index < buffers->rb_recv_index) {
1642 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1643 __func__,
1644 buffers->rb_recv_index - buffers->rb_send_index);
1645 req->rl_reply = NULL;
1646 } else {
1647 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1648 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1649 }
1650 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1651
1652 INIT_LIST_HEAD(&stale);
1653 switch (ia->ri_memreg_strategy) {
1654 case RPCRDMA_FRMR:
1655 req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
1656 break;
1657 case RPCRDMA_MTHCAFMR:
1658 req = rpcrdma_buffer_get_fmrs(req, buffers);
1659 break;
1660 default:
1661 break;
1662 }
1663 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1664 if (!list_empty(&stale))
1665 rpcrdma_retry_flushed_linv(&stale, buffers);
1666 return req;
1667 }
1668
1669 /*
1670 * Put request/reply buffers back into pool.
1671 * Pre-decrement counter/array index.
1672 */
1673 void
1674 rpcrdma_buffer_put(struct rpcrdma_req *req)
1675 {
1676 struct rpcrdma_buffer *buffers = req->rl_buffer;
1677 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1678 unsigned long flags;
1679
1680 spin_lock_irqsave(&buffers->rb_lock, flags);
1681 rpcrdma_buffer_put_sendbuf(req, buffers);
1682 switch (ia->ri_memreg_strategy) {
1683 case RPCRDMA_FRMR:
1684 case RPCRDMA_MTHCAFMR:
1685 rpcrdma_buffer_put_mrs(req, buffers);
1686 break;
1687 default:
1688 break;
1689 }
1690 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1691 }
1692
1693 /*
1694 * Recover reply buffers from pool.
1695 * This happens when recovering from error conditions.
1696 * Post-increment counter/array index.
1697 */
1698 void
1699 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1700 {
1701 struct rpcrdma_buffer *buffers = req->rl_buffer;
1702 unsigned long flags;
1703
1704 spin_lock_irqsave(&buffers->rb_lock, flags);
1705 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1706 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1707 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1708 }
1709 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1710 }
1711
1712 /*
1713 * Put reply buffers back into pool when not attached to
1714 * request. This happens in error conditions.
1715 */
1716 void
1717 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1718 {
1719 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1720 unsigned long flags;
1721
1722 rep->rr_func = NULL;
1723 spin_lock_irqsave(&buffers->rb_lock, flags);
1724 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1725 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1726 }
1727
1728 /*
1729 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1730 */
1731
1732 static int
1733 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1734 struct ib_mr **mrp, struct ib_sge *iov)
1735 {
1736 struct ib_phys_buf ipb;
1737 struct ib_mr *mr;
1738 int rc;
1739
1740 /*
1741 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1742 */
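	/*
	 * The lkey selection below falls back in this order: prefer the
	 * device's global DMA lkey if IB_DEVICE_LOCAL_DMA_LKEY was reported,
	 * otherwise reuse the lkey of the ri_bind_mem DMA MR set up in
	 * rpcrdma_ia_open(), and only as a last resort register the buffer
	 * itself with ib_reg_phys_mr().
	 */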
1743 iov->addr = ib_dma_map_single(ia->ri_id->device,
1744 va, len, DMA_BIDIRECTIONAL);
1745 if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
1746 return -ENOMEM;
1747
1748 iov->length = len;
1749
1750 if (ia->ri_have_dma_lkey) {
1751 *mrp = NULL;
1752 iov->lkey = ia->ri_dma_lkey;
1753 return 0;
1754 } else if (ia->ri_bind_mem != NULL) {
1755 *mrp = NULL;
1756 iov->lkey = ia->ri_bind_mem->lkey;
1757 return 0;
1758 }
1759
1760 ipb.addr = iov->addr;
1761 ipb.size = iov->length;
1762 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1763 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1764
1765 dprintk("RPC: %s: phys convert: 0x%llx "
1766 "registered 0x%llx length %d\n",
1767 __func__, (unsigned long long)ipb.addr,
1768 (unsigned long long)iov->addr, len);
1769
1770 if (IS_ERR(mr)) {
1771 *mrp = NULL;
1772 rc = PTR_ERR(mr);
1773 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1774 } else {
1775 *mrp = mr;
1776 iov->lkey = mr->lkey;
1777 rc = 0;
1778 }
1779
1780 return rc;
1781 }
1782
1783 static int
1784 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1785 struct ib_mr *mr, struct ib_sge *iov)
1786 {
1787 int rc;
1788
1789 ib_dma_unmap_single(ia->ri_id->device,
1790 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1791
1792 if (NULL == mr)
1793 return 0;
1794
1795 rc = ib_dereg_mr(mr);
1796 if (rc)
1797 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1798 return rc;
1799 }
1800
1801 /**
1802 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
1803 * @ia: controlling rpcrdma_ia
1804 * @size: size of buffer to be allocated, in bytes
1805 * @flags: GFP flags
1806 *
1807 * Returns pointer to private header of an area of internally
1808 * registered memory, or an ERR_PTR. The registered buffer follows
1809 * the end of the private header.
1810 *
1811 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
1812 * receiving the payload of RDMA RECV operations. regbufs are not
1813 * used for RDMA READ/WRITE operations, thus are registered only for
1814 * LOCAL access.
1815 */
1816 struct rpcrdma_regbuf *
1817 rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
1818 {
1819 struct rpcrdma_regbuf *rb;
1820 int rc;
1821
1822 rc = -ENOMEM;
1823 rb = kmalloc(sizeof(*rb) + size, flags);
1824 if (rb == NULL)
1825 goto out;
1826
1827 rb->rg_size = size;
1828 rb->rg_owner = NULL;
1829 rc = rpcrdma_register_internal(ia, rb->rg_base, size,
1830 &rb->rg_mr, &rb->rg_iov);
1831 if (rc)
1832 goto out_free;
1833
1834 return rb;
1835
1836 out_free:
1837 kfree(rb);
1838 out:
1839 return ERR_PTR(rc);
1840 }
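/*
 * Usage sketch, taken from rpcrdma_create_rep() and rpcrdma_ep_post_recv()
 * in this file: a reply buffer is obtained with
 *
 *	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
 *					       GFP_KERNEL);
 *
 * and its rg_iov becomes the receive SGE, with rdmab_addr() and
 * rdmab_length() supplying the DMA sync parameters.
 */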
1841
1842 /**
1843 * rpcrdma_free_regbuf - deregister and free registered buffer
1844 * @ia: controlling rpcrdma_ia
1845 * @rb: regbuf to be deregistered and freed
1846 */
1847 void
1848 rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
1849 {
1850 if (rb) {
1851 rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
1852 kfree(rb);
1853 }
1854 }
1855
1856 /*
1857 * Wrappers for chunk registration, shared by read/write chunk code.
1858 */
1859
1860 static void
1861 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1862 {
1863 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1864 seg->mr_dmalen = seg->mr_len;
1865 if (seg->mr_page)
1866 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1867 seg->mr_page, offset_in_page(seg->mr_offset),
1868 seg->mr_dmalen, seg->mr_dir);
1869 else
1870 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1871 seg->mr_offset,
1872 seg->mr_dmalen, seg->mr_dir);
1873 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1874 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1875 __func__,
1876 (unsigned long long)seg->mr_dma,
1877 seg->mr_offset, seg->mr_dmalen);
1878 }
1879 }
1880
1881 static void
1882 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1883 {
1884 if (seg->mr_page)
1885 ib_dma_unmap_page(ia->ri_id->device,
1886 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1887 else
1888 ib_dma_unmap_single(ia->ri_id->device,
1889 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1890 }
1891
1892 static int
1893 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1894 int *nsegs, int writing, struct rpcrdma_ia *ia,
1895 struct rpcrdma_xprt *r_xprt)
1896 {
1897 struct rpcrdma_mr_seg *seg1 = seg;
1898 struct rpcrdma_mw *mw = seg1->rl_mw;
1899 struct rpcrdma_frmr *frmr = &mw->r.frmr;
1900 struct ib_mr *mr = frmr->fr_mr;
1901 struct ib_send_wr fastreg_wr, *bad_wr;
1902 u8 key;
1903 int len, pageoff;
1904 int i, rc;
1905 int seg_len;
1906 u64 pa;
1907 int page_no;
1908
1909 pageoff = offset_in_page(seg1->mr_offset);
1910 seg1->mr_offset -= pageoff; /* start of page */
1911 seg1->mr_len += pageoff;
1912 len = -pageoff;
1913 if (*nsegs > ia->ri_max_frmr_depth)
1914 *nsegs = ia->ri_max_frmr_depth;
1915 for (page_no = i = 0; i < *nsegs;) {
1916 rpcrdma_map_one(ia, seg, writing);
1917 pa = seg->mr_dma;
1918 for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
1919 frmr->fr_pgl->page_list[page_no++] = pa;
1920 pa += PAGE_SIZE;
1921 }
1922 len += seg->mr_len;
1923 ++seg;
1924 ++i;
1925 /* Check for holes */
1926 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1927 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1928 break;
1929 }
1930 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1931 __func__, mw, i);
1932
1933 frmr->fr_state = FRMR_IS_VALID;
1934
1935 memset(&fastreg_wr, 0, sizeof(fastreg_wr));
1936 fastreg_wr.wr_id = (unsigned long)(void *)mw;
1937 fastreg_wr.opcode = IB_WR_FAST_REG_MR;
1938 fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1939 fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
1940 fastreg_wr.wr.fast_reg.page_list_len = page_no;
1941 fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1942 fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
1943 if (fastreg_wr.wr.fast_reg.length < len) {
1944 rc = -EIO;
1945 goto out_err;
1946 }
1947
1948 /* Bump the key */
1949 key = (u8)(mr->rkey & 0x000000FF);
1950 ib_update_fast_reg_key(mr, ++key);
1951
1952 fastreg_wr.wr.fast_reg.access_flags = (writing ?
1953 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1954 IB_ACCESS_REMOTE_READ);
1955 fastreg_wr.wr.fast_reg.rkey = mr->rkey;
1956 DECR_CQCOUNT(&r_xprt->rx_ep);
1957
1958 rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
1959 if (rc) {
1960 dprintk("RPC: %s: failed ib_post_send for register,"
1961 " status %i\n", __func__, rc);
1962 ib_update_fast_reg_key(mr, --key);
1963 goto out_err;
1964 } else {
1965 seg1->mr_rkey = mr->rkey;
1966 seg1->mr_base = seg1->mr_dma + pageoff;
1967 seg1->mr_nsegs = i;
1968 seg1->mr_len = len;
1969 }
1970 *nsegs = i;
1971 return 0;
1972 out_err:
1973 frmr->fr_state = FRMR_IS_INVALID;
1974 while (i--)
1975 rpcrdma_unmap_one(ia, --seg);
1976 return rc;
1977 }
1978
1979 static int
1980 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1981 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1982 {
1983 struct rpcrdma_mr_seg *seg1 = seg;
1984 struct ib_send_wr invalidate_wr, *bad_wr;
1985 int rc;
1986
1987 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
1988
1989 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1990 invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
1991 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1992 invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
1993 DECR_CQCOUNT(&r_xprt->rx_ep);
1994
1995 read_lock(&ia->ri_qplock);
1996 while (seg1->mr_nsegs--)
1997 rpcrdma_unmap_one(ia, seg++);
1998 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1999 read_unlock(&ia->ri_qplock);
2000 if (rc) {
2001 /* Force rpcrdma_buffer_get() to retry */
2002 seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
2003 dprintk("RPC: %s: failed ib_post_send for invalidate,"
2004 " status %i\n", __func__, rc);
2005 }
2006 return rc;
2007 }
2008
2009 static int
2010 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
2011 int *nsegs, int writing, struct rpcrdma_ia *ia)
2012 {
2013 struct rpcrdma_mr_seg *seg1 = seg;
2014 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
2015 int len, pageoff, i, rc;
2016
2017 pageoff = offset_in_page(seg1->mr_offset);
2018 seg1->mr_offset -= pageoff; /* start of page */
2019 seg1->mr_len += pageoff;
2020 len = -pageoff;
2021 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
2022 *nsegs = RPCRDMA_MAX_DATA_SEGS;
2023 for (i = 0; i < *nsegs;) {
2024 rpcrdma_map_one(ia, seg, writing);
2025 physaddrs[i] = seg->mr_dma;
2026 len += seg->mr_len;
2027 ++seg;
2028 ++i;
2029 /* Check for holes */
2030 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
2031 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
2032 break;
2033 }
2034 rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
2035 if (rc) {
2036 dprintk("RPC: %s: failed ib_map_phys_fmr "
2037 "%u@0x%llx+%i (%d)... status %i\n", __func__,
2038 len, (unsigned long long)seg1->mr_dma,
2039 pageoff, i, rc);
2040 while (i--)
2041 rpcrdma_unmap_one(ia, --seg);
2042 } else {
2043 seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
2044 seg1->mr_base = seg1->mr_dma + pageoff;
2045 seg1->mr_nsegs = i;
2046 seg1->mr_len = len;
2047 }
2048 *nsegs = i;
2049 return rc;
2050 }
2051
2052 static int
2053 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
2054 struct rpcrdma_ia *ia)
2055 {
2056 struct rpcrdma_mr_seg *seg1 = seg;
2057 LIST_HEAD(l);
2058 int rc;
2059
2060 list_add(&seg1->rl_mw->r.fmr->list, &l);
2061 rc = ib_unmap_fmr(&l);
2062 read_lock(&ia->ri_qplock);
2063 while (seg1->mr_nsegs--)
2064 rpcrdma_unmap_one(ia, seg++);
2065 read_unlock(&ia->ri_qplock);
2066 if (rc)
2067 dprintk("RPC: %s: failed ib_unmap_fmr,"
2068 " status %i\n", __func__, rc);
2069 return rc;
2070 }
2071
2072 int
2073 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
2074 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
2075 {
2076 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2077 int rc = 0;
2078
2079 switch (ia->ri_memreg_strategy) {
2080
2081 case RPCRDMA_ALLPHYSICAL:
2082 rpcrdma_map_one(ia, seg, writing);
2083 seg->mr_rkey = ia->ri_bind_mem->rkey;
2084 seg->mr_base = seg->mr_dma;
2085 seg->mr_nsegs = 1;
2086 nsegs = 1;
2087 break;
2088
2089 /* Registration using frmr registration */
2090 case RPCRDMA_FRMR:
2091 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
2092 break;
2093
2094 /* Registration using fmr memory registration */
2095 case RPCRDMA_MTHCAFMR:
2096 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
2097 break;
2098
2099 default:
2100 return -EIO;
2101 }
2102 if (rc)
2103 return rc;
2104
2105 return nsegs;
2106 }
2107
2108 int
2109 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
2110 struct rpcrdma_xprt *r_xprt)
2111 {
2112 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
2113 int nsegs = seg->mr_nsegs, rc;
2114
2115 switch (ia->ri_memreg_strategy) {
2116
2117 case RPCRDMA_ALLPHYSICAL:
2118 read_lock(&ia->ri_qplock);
2119 rpcrdma_unmap_one(ia, seg);
2120 read_unlock(&ia->ri_qplock);
2121 break;
2122
2123 case RPCRDMA_FRMR:
2124 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
2125 break;
2126
2127 case RPCRDMA_MTHCAFMR:
2128 rc = rpcrdma_deregister_fmr_external(seg, ia);
2129 break;
2130
2131 default:
2132 break;
2133 }
2134 return nsegs;
2135 }
2136
2137 /*
2138 * Prepost any receive buffer, then post send.
2139 *
2140 * Receive buffer is donated to hardware, reclaimed upon recv completion.
2141 */
2142 int
2143 rpcrdma_ep_post(struct rpcrdma_ia *ia,
2144 struct rpcrdma_ep *ep,
2145 struct rpcrdma_req *req)
2146 {
2147 struct ib_send_wr send_wr, *send_wr_fail;
2148 struct rpcrdma_rep *rep = req->rl_reply;
2149 int rc;
2150
2151 if (rep) {
2152 rc = rpcrdma_ep_post_recv(ia, ep, rep);
2153 if (rc)
2154 goto out;
2155 req->rl_reply = NULL;
2156 }
2157
2158 send_wr.next = NULL;
2159 send_wr.wr_id = 0ULL; /* no send cookie */
2160 send_wr.sg_list = req->rl_send_iov;
2161 send_wr.num_sge = req->rl_niovs;
2162 send_wr.opcode = IB_WR_SEND;
2163 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
2164 ib_dma_sync_single_for_device(ia->ri_id->device,
2165 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
2166 DMA_TO_DEVICE);
2167 ib_dma_sync_single_for_device(ia->ri_id->device,
2168 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
2169 DMA_TO_DEVICE);
2170 ib_dma_sync_single_for_device(ia->ri_id->device,
2171 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
2172 DMA_TO_DEVICE);
2173
2174 if (DECR_CQCOUNT(ep) > 0)
2175 send_wr.send_flags = 0;
2176 else { /* Provider must take a send completion every now and then */
2177 INIT_CQCOUNT(ep);
2178 send_wr.send_flags = IB_SEND_SIGNALED;
2179 }
2180
2181 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
2182 if (rc)
2183 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
2184 rc);
2185 out:
2186 return rc;
2187 }
2188
2189 /*
2190 * (Re)post a receive buffer.
2191 */
2192 int
2193 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
2194 struct rpcrdma_ep *ep,
2195 struct rpcrdma_rep *rep)
2196 {
2197 struct ib_recv_wr recv_wr, *recv_wr_fail;
2198 int rc;
2199
2200 recv_wr.next = NULL;
2201 recv_wr.wr_id = (u64) (unsigned long) rep;
2202 recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
2203 recv_wr.num_sge = 1;
2204
2205 ib_dma_sync_single_for_cpu(ia->ri_id->device,
2206 rdmab_addr(rep->rr_rdmabuf),
2207 rdmab_length(rep->rr_rdmabuf),
2208 DMA_BIDIRECTIONAL);
2209
2210 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
2211
2212 if (rc)
2213 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
2214 rc);
2215 return rc;
2216 }
2217
2218 /* Physical mapping means one Read/Write list entry per page.
2219  * All list entries must fit within an inline buffer.
2220 *
2221 * NB: The server must return a Write list for NFS READ,
2222 * which has the same constraint. Factor in the inline
2223 * rsize as well.
2224 */
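/*
 * Worked example with illustrative values: if both inline sizes were 1024
 * bytes, RPCRDMA_HDRLEN_MIN were 28, sizeof(struct rpcrdma_segment) were 16
 * and PAGE_SHIFT were 12, the result would be (1024 - 28) / 16 = 62 list
 * entries, or 62 << 12 = 248 KiB of payload.
 */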
2225 static size_t
2226 rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
2227 {
2228 struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
2229 unsigned int inline_size, pages;
2230
2231 inline_size = min_t(unsigned int,
2232 cdata->inline_wsize, cdata->inline_rsize);
2233 inline_size -= RPCRDMA_HDRLEN_MIN;
2234 pages = inline_size / sizeof(struct rpcrdma_segment);
2235 return pages << PAGE_SHIFT;
2236 }
2237
2238 static size_t
2239 rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
2240 {
2241 return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
2242 }
2243
2244 size_t
2245 rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
2246 {
2247 size_t result;
2248
2249 switch (r_xprt->rx_ia.ri_memreg_strategy) {
2250 case RPCRDMA_ALLPHYSICAL:
2251 result = rpcrdma_physical_max_payload(r_xprt);
2252 break;
2253 default:
2254 result = rpcrdma_mr_max_payload(r_xprt);
2255 }
2256 return result;
2257 }